From e489d6c7df7c4ed6fdf22414a52d631be5f6eedb Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 20 Jun 2025 17:13:03 -0700
Subject: [PATCH 001/266] Runtime support for openvino quantized models
---
backends/openvino/runtime/OpenvinoBackend.cpp | 24 +++++++++++++++++++
backends/openvino/scripts/openvino_build.sh | 1 +
examples/models/llama/CMakeLists.txt | 8 +++++++
tools/cmake/executorch-config.cmake | 1 +
4 files changed, 34 insertions(+)
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index a3134f72b4b..39a1bf55c32 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -114,6 +114,26 @@ exr::Error OpenvinoBackend::execute(
ov_type, input_shape, input_tensor.mutable_data_ptr());
infer_request->set_input_tensor(i, ov_input_tensor);
+
+ if (args[i]->isInt()) {
+ int64_t *val = &(args[i]->payload.copyable_union.as_int);
+
+ // Create OpenVINO tensor from integer input
+ ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+ infer_request->set_input_tensor(i, ov_input_tensor);
+ } else {
+ auto input_tensor = args[i]->toTensor();
+ ov::Shape input_shape(
+ input_tensor.sizes().begin(), input_tensor.sizes().end());
+
+ // Convert input tensor to OpenVINO tensor
+ ov::element::Type ov_type =
+ convert_to_openvino_type(input_tensor.scalar_type());
+ ov::Tensor ov_input_tensor(
+ ov_type, input_shape, input_tensor.mutable_data_ptr());
+
+ infer_request->set_input_tensor(i, ov_input_tensor);
+ }
}
// Set outputs
@@ -165,10 +185,14 @@ ov::element::Type OpenvinoBackend::convert_to_openvino_type(
switch (scalar_type) {
case exa::ScalarType::Float:
return ov::element::f32;
+ case exa::ScalarType::Half:
+ return ov::element::f16;
case exa::ScalarType::Int:
return ov::element::i32;
case exa::ScalarType::Char:
return ov::element::i8;
+ case exa::ScalarType::Byte:
+ return ov::element::u8;
case exa::ScalarType::Long:
return ov::element::i64;
case exa::ScalarType::Bool:
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index bc85d6b8410..c10a3bb4eeb 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -29,6 +29,7 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
-B"${build_dir}"
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index 8c27de20845..1063ebf2561 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -189,6 +189,14 @@ if(TARGET mpsdelegate)
target_link_options_shared_lib(mpsdelegate)
endif()
+# Openvino backend
+if(TARGET openvino_backend)
+ find_package(OpenVINO REQUIRED)
+ target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core)
+ list(APPEND link_libraries openvino_backend)
+ target_link_options_shared_lib(openvino_backend)
+endif()
+
if(TARGET coremldelegate)
find_library(SQLITE_LIBRARY sqlite3)
list(
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index aa5776163a9..adf978fb70a 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -94,6 +94,7 @@ set(lib_list
quantized_kernels
quantized_ops_lib
quantized_ops_aot_lib
+ openvino_backend
)
foreach(lib ${lib_list})
# Name of the variable which stores result of the find_library search
From f0d901f3358fc9bc59b97450111ec0071b90044a Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 20 Jun 2025 21:41:24 -0700
Subject: [PATCH 002/266] openvino export_llama_lib support
---
examples/models/llama/config/llm_config.py | 17 +++++++++++++++++
examples/models/llama/export_llama_lib.py | 17 +++++++++++++++++
extension/llm/export/partitioner_lib.py | 13 +++++++++++++
3 files changed, 47 insertions(+)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 034d8af7562..2de58fe47eb 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -437,6 +437,16 @@ class MPSConfig:
enabled: bool = False
+@dataclass
+class OpenvinoConfig:
+ """
+ Configures the OpenVINO backend.
+ """
+
+ enabled: bool = False
+ device: str = "CPU"
+
+
@dataclass
class BackendConfig:
"""
@@ -449,6 +459,7 @@ class BackendConfig:
vulkan: VulkanConfig = field(default_factory=VulkanConfig)
qnn: QNNConfig = field(default_factory=QNNConfig)
mps: MPSConfig = field(default_factory=MPSConfig)
+ openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig)
################################################################################
@@ -609,6 +620,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
if hasattr(args, "mps"):
llm_config.backend.mps.enabled = args.mps
+ # Openvino
+ if hasattr(args, "openvino"):
+ llm_config.backend.openvino.enabled = args.openvino
+ if hasattr(args, "openvino_device"):
+ llm_config.backend.openvino.device = args.openvino_device
+
# DebugConfig
if hasattr(args, "profile_memory"):
llm_config.debug.profile_memory = args.profile_memory
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1f055d65822..8afaa8bf409 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -39,6 +39,7 @@
from executorch.extension.llm.export.partitioner_lib import (
get_coreml_partitioner,
get_mps_partitioner,
+ get_openvino_partitioner,
get_qnn_partitioner,
get_vulkan_partitioner,
get_xnnpack_partitioner,
@@ -443,6 +444,14 @@ def build_args_parser() -> argparse.ArgumentParser:
action="store_true",
help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True",
)
+ parser.add_argument("--openvino", action="store_true")
+ parser.add_argument(
+ "--openvino_device",
+ type=str,
+ default=None,
+ choices=["CPU", "GPU"],
+ help="Specify the device for Openvino (CPU or GPU).",
+ )
parser.add_argument(
"--expand_rope_table",
@@ -857,6 +866,8 @@ def _to_edge_and_lower_llama( # noqa: C901
mps: bool = False,
coreml: bool = False,
qnn: bool = False,
+ openvino: bool = False,
+ openvino_device: str = "CPU",
dtype_override: str = "fp32",
enable_dynamic_shape: bool = True,
use_kv_cache: bool = False,
@@ -901,6 +912,10 @@ def _to_edge_and_lower_llama( # noqa: C901
partitioners.append(coreml_partitioner)
modelname = f"coreml_{modelname}"
+ if openvino:
+ partitioners.append(get_openvino_partitioner(openvino_device))
+ modelname = f"openvino_{modelname}"
+
if qnn:
logging.warning(
"The model definition in current repro is not performant, please refer to the instruction"
@@ -1068,6 +1083,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
+ openvino=llm_config.backend.openvino.enabled,
+ openvino_device=llm_config.backend.openvino.device,
dtype_override=llm_config.model.dtype_override,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index 20604bbf635..3c795dcdf66 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -63,6 +63,19 @@ def get_mps_partitioner(use_kv_cache: bool = False):
compile_specs = [CompileSpec("use_fp16", bytes([True]))]
return MPSPartitioner(compile_specs) # pyre-fixme[16]
+def get_openvino_partitioner(device: str):
+ try:
+ from executorch.exir.backend.backend_details import CompileSpec
+ from executorch.backends.openvino.partitioner import (
+ OpenvinoPartitioner,
+ )
+ except ImportError:
+ raise ImportError(
+ "Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino"
+ )
+
+ compile_specs = [CompileSpec("device", device.encode())]
+ return OpenvinoPartitioner(compile_specs)
def get_coreml_partitioner(
ios: int = 15,
From 24f2d930c62484ba038bd9ee9c7fb9fb73cc3fd5 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Sat, 21 Jun 2025 20:43:05 -0700
Subject: [PATCH 003/266] nncf pattern checker in openvino partitioner
---
backends/openvino/partitioner.py | 62 +++++++++++++++++++++++
examples/models/llama/export_llama_lib.py | 2 +-
2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..4828a96f0dd 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -25,6 +25,11 @@
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase
+class PatternNode:
+ op_types = {}
+
+ def __init__(self):
+ self.op_types = {}
class OpenvinoOperatorsSupport(OperatorSupportBase):
@@ -32,6 +37,7 @@ def __init__(
self,
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
+ enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoOperatorsSupport class.
@@ -43,9 +49,12 @@ def __init__(
op_types_to_skip = set()
if op_names_to_skip is None:
op_names_to_skip = set()
+ if enabled_ops_by_name is None:
+ enabled_ops_by_name = set()
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
+ self._enabled_ops_by_name = enabled_ops_by_name
def is_node_supported(self, _, node: torch.fx.Node) -> bool:
"""
@@ -62,6 +71,10 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
op_type = node.target.__name__
else:
op_type = str(node.target)
+
+ if node.name in self._enabled_ops_by_name:
+ return True
+
supported_ops = OperatorSupport(options)._support_dict
if op_type == "getitem":
return True
@@ -88,6 +101,7 @@ def __init__(
compile_spec: List[CompileSpec],
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
+ enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoPartitioner class.
@@ -99,6 +113,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
+ self._enabled_ops_by_name = enabled_ops_by_name
def ops_to_not_decompose(
self,
@@ -120,6 +135,52 @@ def ops_to_not_decompose(
]
return (ops_not_decompose, None)
+ def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+ if node.op == "call_function":
+ if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
+ pt_input_nodes = node.all_input_nodes
+ pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)]
+ if pattern_input_ops is None:
+ enabled_ops.append(node)
+ return True
+ if len(pt_input_nodes) != len(pattern_input_ops):
+ return False
+ for i in range(len(pt_input_nodes)):
+ if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+ return False
+ enabled_ops.append(node)
+ return True
+ elif node.op == "get_attr":
+ if "get_attr" in pattern.op_types:
+ return True
+ else:
+ return False
+ elif node.op == "placeholder":
+ if "placeholder" in pattern.op_types:
+ return True
+ else:
+ return False
+ return False
+
+ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
+ const_node = PatternNode
+ const_node.op_types["get_attr"] = None
+ const_node.op_types["placeholder"] = None
+ bitwise_right_shift_node = PatternNode
+ bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
+ bitwise_and_node = PatternNode
+ bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node]
+ stack_node = PatternNode
+ stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
+
+ for node in graph_module.graph.nodes:
+ if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default":
+ enabled_ops = []
+ pattern_match = self.check_pattern(node, stack_node, enabled_ops)
+ if pattern_match:
+ for pattern_op in enabled_ops:
+ self._enabled_ops_by_name.add(pattern_op.name)
+
def partition(self, exported_program: ExportedProgram) -> PartitionResult:
"""
Partitions an exported program into supported and unsupported segments.
@@ -127,6 +188,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
:param exported_program: The exported program.
:return: A PartitionResult containing the partitioned graph and delegation tags.
"""
+ self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 8afaa8bf409..a01b05daa17 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -448,7 +448,7 @@ def build_args_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--openvino_device",
type=str,
- default=None,
+ default="CPU",
choices=["CPU", "GPU"],
help="Specify the device for Openvino (CPU or GPU).",
)
From 7dd8d0f17aec743d7796bf7b314df97f2aeb90eb Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 23 Jun 2025 19:11:55 +0400
Subject: [PATCH 004/266] nncf compression init
---
examples/models/llama/export_llama_lib.py | 8 ++++++
extension/llm/export/builder.py | 32 +++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index a01b05daa17..087e4d1efdc 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -551,6 +551,13 @@ def build_args_parser() -> argparse.ArgumentParser:
help="path to the input pruning token mapping file (token_map.json)",
)
+ parser.add_argument(
+ "--nncf_compression",
+ default=False,
+ action="store_true",
+ help="If true, stops right after torch.export() and saves the exported model.",
+ )
+
parser.add_argument(
"--export_only",
default=False,
@@ -1207,6 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
+ nncf_compression=llm_config.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 4128bfd8198..f185d9b346d 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -16,6 +16,7 @@
from typing import Any, Callable, Dict, List, Optional, Tuple
from unittest.mock import patch
+import nncf
import torch
from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
DuplicateDynamicQuantChainPass,
@@ -40,6 +41,7 @@
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
from torchao.utils import unwrap_tensor_subclass
+from functools import partial
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -98,6 +100,7 @@ def __init__(
dynamic_shapes: Optional[Any] = None,
use_legacy_export: bool = False,
save_exported_program: bool = False,
+ nncf_compression: bool = False
):
# Store necessary constructor arguments.
self.model = model
@@ -119,6 +122,7 @@ def __init__(
self.dynamic_shapes = dynamic_shapes
self.use_legacy_export = use_legacy_export
self.save_exported_program = save_exported_program
+ self.nncf_compression = nncf_compression
# Note: treat this as the source of truth for the result of
# torch.export'ing a model. If the overall ExportedProgram is needed,
@@ -428,6 +432,34 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
DuplicateDynamicQuantChainPass()(m)
self.pre_autograd_graph_module = m
return self
+ elif (self.nncf_compression):
+ tokenizer = get_tokenizer(self.tokenizer_path)
+
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
+ logging.error(tokenized_text)
+
+ inputs = ()
+ inputs = (
+ torch.tensor(tokenized_text).unsqueeze(0),
+ {"input_pos": torch.tensor([0])},
+ )
+
+ return inputs
+
+ self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
+ self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data
+
+ self.pre_autograd_graph_module = nncf.compress_weights(
+ self.pre_autograd_graph_module,
+ dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+ return self
else:
logging.info("No quantizer provided, passing...")
return self
From 1716834b5ff3889da366f54e2d6f2a3e3e999117 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Mon, 23 Jun 2025 13:43:11 -0700
Subject: [PATCH 005/266] openvino backend llama nncf support
---
backends/openvino/partitioner.py | 5 +-
backends/openvino/utils.py | 66 ++++++++++++++++++++++
examples/models/llama/config/llm_config.py | 3 +
examples/models/llama/export_llama_lib.py | 4 +-
extension/llm/export/builder.py | 39 +++++++++----
5 files changed, 101 insertions(+), 16 deletions(-)
create mode 100644 backends/openvino/utils.py
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 4828a96f0dd..b1e7f5d436a 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -101,7 +101,6 @@ def __init__(
compile_spec: List[CompileSpec],
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
- enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoPartitioner class.
@@ -113,7 +112,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
- self._enabled_ops_by_name = enabled_ops_by_name
+ self._enabled_ops_by_name = set()
def ops_to_not_decompose(
self,
@@ -191,7 +190,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
- OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
+ OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name),
allows_single_node_partition=True,
)
partition_list = partitioner.propose_partitions()
diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py
new file mode 100644
index 00000000000..ec4bebe0d6d
--- /dev/null
+++ b/backends/openvino/utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import executorch.exir as exir
+
+import torch
+from executorch.exir import EdgeProgramManager
+from executorch.exir.program._program import to_edge_with_preserved_ops
+from executorch.exir.tracer import Value
+from torch.export import ExportedProgram
+from executorch.extension.export_util.utils import _to_core_aten
+
+_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
+ _check_ir_validity=True,
+ _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue
+)
+
+def nncf_core_aten_to_edge(
+ core_aten_exir_ep: ExportedProgram,
+ edge_constant_methods: Optional[Dict[str, Any]] = None,
+ edge_compile_config=None,
+ verbose=True,
+) -> EdgeProgramManager:
+ if not edge_compile_config:
+ edge_compile_config = exir.EdgeCompileConfig(
+ _check_ir_validity=False, # quant ops currently break ir verification
+ )
+ edge_manager: EdgeProgramManager = to_edge_with_preserved_ops(
+ core_aten_exir_ep,
+ constant_methods=edge_constant_methods,
+ compile_config=edge_compile_config,
+ preserve_ops=[torch.ops.aten.stack.default,],
+ )
+ if verbose:
+ logging.info(f"Exported graph:\n{edge_manager.exported_program()}")
+ return edge_manager
+
+def nncf_export_to_edge(
+ model: Union[torch.fx.GraphModule, torch.nn.Module],
+ example_inputs: Tuple[Value, ...],
+ *,
+ example_kwarg_inputs: Optional[Dict] = None,
+ dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+ edge_constant_methods: Optional[Dict[str, Any]] = None,
+ edge_compile_config=_EDGE_COMPILE_CONFIG,
+ strict=True,
+ verbose=True,
+) -> EdgeProgramManager:
+ core_aten_ep = _to_core_aten(
+ model,
+ example_inputs,
+ example_kwarg_inputs=example_kwarg_inputs,
+ dynamic_shapes=dynamic_shapes,
+ strict=strict,
+ verbose=verbose,
+ )
+ return nncf_core_aten_to_edge(
+ core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
+ )
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 2de58fe47eb..530f7335d8e 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -445,6 +445,7 @@ class OpenvinoConfig:
enabled: bool = False
device: str = "CPU"
+ nncf_compression = False
@dataclass
@@ -625,6 +626,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.backend.openvino.enabled = args.openvino
if hasattr(args, "openvino_device"):
llm_config.backend.openvino.device = args.openvino_device
+ if hasattr(args, "nncf_compression"):
+ llm_config.backend.openvino.nncf_compression = args.nncf_compression
# DebugConfig
if hasattr(args, "profile_memory"):
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 087e4d1efdc..1ea82e3224a 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -555,7 +555,7 @@ def build_args_parser() -> argparse.ArgumentParser:
"--nncf_compression",
default=False,
action="store_true",
- help="If true, stops right after torch.export() and saves the exported model.",
+ help="Enables nncf compression for openvino backend",
)
parser.add_argument(
@@ -1214,7 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
- nncf_compression=llm_config.nncf_compression,
+ nncf_compression=llm_config.backend.openvino.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index f185d9b346d..a2bfaeae22d 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -16,7 +16,6 @@
from typing import Any, Callable, Dict, List, Optional, Tuple
from unittest.mock import patch
-import nncf
import torch
from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
DuplicateDynamicQuantChainPass,
@@ -41,7 +40,6 @@
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
from torchao.utils import unwrap_tensor_subclass
-from functools import partial
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -433,6 +431,13 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
self.pre_autograd_graph_module = m
return self
elif (self.nncf_compression):
+ try:
+ import nncf
+ from functools import partial
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
tokenizer = get_tokenizer(self.tokenizer_path)
def transform_fn(
@@ -487,15 +492,27 @@ def export_to_edge(self) -> "LLMEdgeManager":
)
with override_export_behaviour:
- self.edge_manager = export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
+ if (self.nncf_compression):
+ from executorch.backends.openvino.utils import nncf_export_to_edge
+ self.edge_manager = nncf_export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
+ else:
+ self.edge_manager = export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
return self
def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":
From 198190e6a250632ed9921fa346895521e5b22dfb Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 7 Jul 2025 14:38:05 +0400
Subject: [PATCH 006/266] openvino quantizer init
---
.../quantizer/observers/nncf_observers.py | 114 ++++++++++++
backends/openvino/quantizer/quantizer.py | 170 ++++++++++++------
2 files changed, 228 insertions(+), 56 deletions(-)
create mode 100644 backends/openvino/quantizer/observers/nncf_observers.py
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
new file mode 100644
index 00000000000..54f4348e0ed
--- /dev/null
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -0,0 +1,114 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size
+from torch.ao.quantization.pt2e._affine_quantization import (
+ _get_reduction_params,
+ AffineQuantizedMinMaxObserver,
+)
+from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor
+from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder
+from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
+from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
+
+from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.parameters import CompressWeightsMode
+from nncf.tensor.tensor import Tensor
+
+class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM
+ assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
+ self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)
+
+ def calculate_qparams(self, weight):
+ assert hasattr(self, "min_val") and hasattr(
+ self, "max_val"
+ ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
+ _, reduction_dims = _get_reduction_params(
+ self.block_size, weight.size()
+ )
+ assert len(reduction_dims) == 1, "Only 1-D group size is supported"
+ reduction_dims = reduction_dims[0] - 1
+ q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+ print("calling convert")
+ assert (
+ self.original_dtype is not None
+ ), "Expecting original_dtype to be populated"
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+ with model.graph.inserting_before(observer_node):
+ if(zero_point is not None):
+ decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype)
+ else:
+ decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
+ packed_q_weight = decompressor.pack_weight(q_weight)
+ new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ decompressor_name,
+ )(model)
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node)
+ model.graph.erase_node(observer_node)
+
+
+class NNCFInt8observer(PerChannelMinMaxObserver):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM
+ self.wc_config = WeightCompressionConfig(mode=qmode)
+
+ def calculate_qparams(self, weight):
+ assert hasattr(self, "min_val") and hasattr(
+ self, "max_val"
+ ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
+ self.granularity = PerAxis(axis=self.ch_axis)
+ self.block_size = get_block_size(weight.shape, self.granularity)
+ _, reduction_dims = _get_reduction_params(
+ self.block_size, weight.size()
+ )
+ q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+ print("calling convert")
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+ with model.graph.inserting_before(observer_node):
+ if(zero_point is not None):
+ decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
+ else:
+ decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ packed_q_weight = decompressor.pack_weight(q_weight)
+ new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ decompressor_name,
+ )(model)
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node)
+ model.graph.erase_node(observer_node)
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index d0622b24e6d..f8f08996f53 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -21,6 +21,8 @@
HistogramObserver,
PerChannelMinMaxObserver,
UniformQuantizationObserverBase,
+ PerGroup,
+ MappingType,
)
from torchao.quantization.pt2e.quantizer import (
EdgeOrNode,
@@ -30,6 +32,9 @@
Quantizer,
SharedQuantizationSpec,
)
+from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
+from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer
QUANT_ANNOTATION_KEY = "quantization_annotation"
@@ -46,6 +51,10 @@ class QuantizationMode(Enum):
INT8_SYM = "int8_sym"
INT8_MIXED = "int8_mixed"
INT8_TRANSFORMER = "int8_transformer"
+ INT8_SYM_WC = "int8_sym_wc"
+ INT8_ASYM_WC = "int8_asym_wc"
+ INT4_SYM_WC = "int4_sym"
+ INT4_ASYM_WC = "int4_asym"
class OpenVINOQuantizer(Quantizer):
@@ -66,8 +75,12 @@ def __init__(
- INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
- INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
Default value is INT8_SYM.
+ - INT4_SYM: Symmetric INT4 Weights-Only Compression
+ - INT4_ASYM: Asymmetric INT4 Weights-Only Compression
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
+ self.mode = mode
+ self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
if mode == QuantizationMode.INT8_SYM:
preset = quantization.structs.QuantizationPreset.PERFORMANCE
model_type = None
@@ -77,11 +90,24 @@ def __init__(
else:
preset = None
model_type = nncf.parameters.ModelType.TRANSFORMER
- self._min_max_algo = (
- nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
- preset=preset, model_type=model_type, **kwargs
+ if(self.mode not in self.wc_modes):
+ self._min_max_algo = (
+ nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
+ preset=preset, model_type=model_type, **kwargs
+ )
)
- )
+ self._algo = self._min_max_algo
+ else:
+ weight_compression_configuration = get_weight_compression_configuration(
+ mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode
+ **kwargs
+ )
+ self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+ subset_size=None,
+ **weight_compression_configuration
+ )
+ self._algo = self._weight_compression_algo
+
def set_ignored_scope(
self,
@@ -102,7 +128,7 @@ def set_ignored_scope(
:param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
in the model graph.
"""
- self._min_max_algo.set_ignored_scope(
+ self._algo.set_ignored_scope(
nncf.IgnoredScope(
names=names or [],
patterns=patterns or [],
@@ -115,63 +141,80 @@ def set_ignored_scope(
def get_nncf_quantization_setup(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
- self._min_max_algo._set_backend_entity(model)
- return self._min_max_algo.find_quantization_setup(model, nncf_graph)
+ self._algo._set_backend_entity(model)
+ return self._algo.find_quantization_setup(model, nncf_graph)
def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
- quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
-
+
graph = model.graph
node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
defaultdict(QuantizationAnnotation)
)
+        # Separate annotation handling for quantize vs. compress modes
+ if(self.mode in self.wc_modes):
+ self._algo.set_backend_entity(model)
+ nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+ for node in nodes_to_compress:
+ quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
+ group_size = self._algo._group_size
+ num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8
+ qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
+ nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
+ qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ else:
+ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
- for qp in quantization_setup.quantization_points.values():
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ for qp in quantization_setup.quantization_points.values():
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ for quantizer_ids in quantization_setup.unified_scale_groups.values():
- root_quantizer_id = self._get_unified_scales_root_quantizer_id(
- nncf_graph, quantizer_ids, quantization_setup
- )
- root_qp = quantization_setup.quantization_points[root_quantizer_id]
+ root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+ nncf_graph, quantizer_ids, quantization_setup
+ )
+ root_qp = quantization_setup.quantization_points[root_quantizer_id]
- if any(
- root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
- for q_id in quantizer_ids
- ):
- qps = [
- quantization_setup.quantization_points[q_id]
+ if any(
+ root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
- ]
- msg = (
- "Different quantization configs are set to one unified scale group:"
- f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+ ):
+ qps = [
+ quantization_setup.quantization_points[q_id]
+ for q_id in quantizer_ids
+ ]
+ msg = (
+ "Different quantization configs are set to one unified scale group:"
+ f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+ )
+ raise nncf.InternalError(msg)
+
+ root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, root_qp.insertion_point.target_node_name
+ )
+ root_edge_or_node = self._get_edge_or_node(
+ root_target_node, root_qp, nncf_graph
)
- raise nncf.InternalError(msg)
-
- root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, root_qp.insertion_point.target_node_name
- )
- root_edge_or_node = self._get_edge_or_node(
- root_target_node, root_qp, nncf_graph
- )
- for quantizer_id in quantizer_ids:
- if quantizer_id == root_quantizer_id:
- continue
+ for quantizer_id in quantizer_ids:
+ if quantizer_id == root_quantizer_id:
+ continue
- qspec = SharedQuantizationSpec(root_edge_or_node)
- qp = quantization_setup.quantization_points[quantizer_id]
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ qspec = SharedQuantizationSpec(root_edge_or_node)
+ qp = quantization_setup.quantization_points[quantizer_id]
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for node, annotation in node_vs_torch_annotation.items():
assert QUANT_ANNOTATION_KEY not in node.meta
@@ -295,8 +338,8 @@ def _fill_torch_ao_annotation(
annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
@staticmethod
- def _get_torch_ao_qspec_from_qp(
- qp: quantization.quantizer_setup.QuantizationPointBase,
+ def _get_torch_ao_qspec_from_nncf_config(
+ qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
) -> QuantizationSpec:
"""
Retrieves the quantization configuration from the given quantization point and
@@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp(
"""
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args = {"eps": 1e-16}
- qconfig = qp.qconfig
is_weight = qp.is_weight_quantization_point()
+ qconfig = qp.qconfig
observer: Type[UniformQuantizationObserverBase]
-
if qconfig.per_channel:
torch_qscheme = (
torch.per_channel_symmetric
@@ -325,11 +367,27 @@ def _get_torch_ao_qspec_from_qp(
else torch.per_tensor_affine
)
if is_weight:
- observer = PerChannelMinMaxObserver
- quant_min = -128
- quant_max = 127
- dtype = torch.int8
- channel_axis = 0
+ mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
+ if qconfig.num_bits==4:
+ extra_args["mapping_type"] = mapping_type
+ extra_args["target_dtype"] = torch.int8
+ extra_args["granularity"] = PerGroup(group_size=group_size)
+ observer = PTPerBlockParamObserver
+ quant_min = -8
+ quant_max = 7
+ dtype = torch.int8
+ channel_axis = 0
+ elif qconfig.num_bits==8:
+ observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ quant_min = -128
+ quant_max = 127
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
+ )
else:
observer = (
HistogramObserver
From 3d88a4ea80179ba5b4498a47b3365440c81a37bd Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Tue, 8 Jul 2025 12:45:43 -0700
Subject: [PATCH 007/266] Moved all openvino llama example changes into
export_llama_lib
---
backends/openvino/partitioner.py | 1 +
examples/models/llama/export_llama_lib.py | 85 ++++++++++++++++++++---
extension/llm/export/builder.py | 67 +++---------------
3 files changed, 86 insertions(+), 67 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index b1e7f5d436a..b508a698cab 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -131,6 +131,7 @@ def ops_to_not_decompose(
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.upsample_nearest2d.default,
torch.ops.aten.upsample_nearest2d.vec,
+ torch.ops.aten.stack.default,
]
return (ops_not_decompose, None)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1ea82e3224a..ecf0ea72dca 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -862,6 +862,73 @@ def _to_edge_and_lower_llama_xnnpack(
return builder.to_executorch(passes=additional_passes)
+def _to_edge_and_lower_llama_openvino(
+ builder_exported,
+ modelname,
+ additional_passes,
+ openvino_device: str = "CPU",
+ nncf_compression: bool = False,
+ verbose: bool = False,
+) -> LLMEdgeManager: # noqa: C901
+ partitioners = []
+
+ # Add OpenVINO partitioner
+ partitioners.append(get_openvino_partitioner(openvino_device))
+ modelname = f"openvino_{modelname}"
+
+
+ logging.info("Lowering model using following partitioner(s): ")
+ for partitioner in partitioners:
+ logging.info(f"--> {partitioner.__class__.__name__}")
+
+ # Use NNCF compression if enabled
+ # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
+ if nncf_compression:
+ try:
+ import nncf
+ from functools import partial
+ from pytorch_tokenizers import get_tokenizer
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+ tokenizer = get_tokenizer(builder_exported.tokenizer_path)
+
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
+ logging.error(tokenized_text)
+
+ inputs = ()
+ inputs = (
+ torch.tensor(tokenized_text).unsqueeze(0),
+ {"input_pos": torch.tensor([0])},
+ )
+
+ return inputs
+
+ builder_exported.calibration_data = [builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data
+ builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data
+
+ builder_exported.pre_autograd_graph_module = nncf.compress_weights(
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(
+ partitioners
+ )
+
+ if verbose:
+ print_delegation_info(builder.edge_manager.exported_program().graph_module)
+
+ return builder.to_executorch(passes=additional_passes)
+
+
def _to_edge_and_lower_llama( # noqa: C901
builder_exported,
modelname,
@@ -873,8 +940,6 @@ def _to_edge_and_lower_llama( # noqa: C901
mps: bool = False,
coreml: bool = False,
qnn: bool = False,
- openvino: bool = False,
- openvino_device: str = "CPU",
dtype_override: str = "fp32",
enable_dynamic_shape: bool = True,
use_kv_cache: bool = False,
@@ -919,10 +984,6 @@ def _to_edge_and_lower_llama( # noqa: C901
partitioners.append(coreml_partitioner)
modelname = f"coreml_{modelname}"
- if openvino:
- partitioners.append(get_openvino_partitioner(openvino_device))
- modelname = f"openvino_{modelname}"
-
if qnn:
logging.warning(
"The model definition in current repro is not performant, please refer to the instruction"
@@ -1078,6 +1139,15 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
generate_etrecord=llm_config.debug.generate_etrecord,
verbose=llm_config.debug.verbose,
)
+ elif llm_config.backend.openvino.enabled:
+ builder = _to_edge_and_lower_llama_openvino(
+ builder_exported,
+ modelname,
+ additional_passes,
+ openvino_device=llm_config.backend.openvino.device,
+ nncf_compression=llm_config.backend.openvino.nncf_compression,
+ verbose=llm_config.debug.verbose,
+ )
else:
builder = _to_edge_and_lower_llama(
builder_exported,
@@ -1090,8 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
- openvino=llm_config.backend.openvino.enabled,
- openvino_device=llm_config.backend.openvino.device,
dtype_override=llm_config.model.dtype_override,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
@@ -1214,7 +1282,6 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
- nncf_compression=llm_config.backend.openvino.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index a2bfaeae22d..4128bfd8198 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -98,7 +98,6 @@ def __init__(
dynamic_shapes: Optional[Any] = None,
use_legacy_export: bool = False,
save_exported_program: bool = False,
- nncf_compression: bool = False
):
# Store necessary constructor arguments.
self.model = model
@@ -120,7 +119,6 @@ def __init__(
self.dynamic_shapes = dynamic_shapes
self.use_legacy_export = use_legacy_export
self.save_exported_program = save_exported_program
- self.nncf_compression = nncf_compression
# Note: treat this as the source of truth for the result of
# torch.export'ing a model. If the overall ExportedProgram is needed,
@@ -430,41 +428,6 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
DuplicateDynamicQuantChainPass()(m)
self.pre_autograd_graph_module = m
return self
- elif (self.nncf_compression):
- try:
- import nncf
- from functools import partial
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
- tokenizer = get_tokenizer(self.tokenizer_path)
-
- def transform_fn(
- prompts: str, tokenizer
- ):
- tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
- logging.error(tokenized_text)
-
- inputs = ()
- inputs = (
- torch.tensor(tokenized_text).unsqueeze(0),
- {"input_pos": torch.tensor([0])},
- )
-
- return inputs
-
- self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
- self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data
-
- self.pre_autograd_graph_module = nncf.compress_weights(
- self.pre_autograd_graph_module,
- dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
- return self
else:
logging.info("No quantizer provided, passing...")
return self
@@ -492,27 +455,15 @@ def export_to_edge(self) -> "LLMEdgeManager":
)
with override_export_behaviour:
- if (self.nncf_compression):
- from executorch.backends.openvino.utils import nncf_export_to_edge
- self.edge_manager = nncf_export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
- else:
- self.edge_manager = export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
+ self.edge_manager = export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
return self
def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":
From e81f60d895fe235e00fa11567f5f85e6d6e25d08 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Tue, 8 Jul 2025 12:57:22 -0700
Subject: [PATCH 008/266] Removed openvino utils.py since it is not needed
anymore
---
backends/openvino/utils.py | 66 --------------------------------------
1 file changed, 66 deletions(-)
delete mode 100644 backends/openvino/utils.py
diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py
deleted file mode 100644
index ec4bebe0d6d..00000000000
--- a/backends/openvino/utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Intel Corporation
-#
-# Licensed under the BSD License (the "License"); you may not use this file
-# except in compliance with the License. See the license file found in the
-# LICENSE file in the root directory of this source tree.
-
-import logging
-
-from typing import Any, Dict, Optional, Tuple, Union
-
-import executorch.exir as exir
-
-import torch
-from executorch.exir import EdgeProgramManager
-from executorch.exir.program._program import to_edge_with_preserved_ops
-from executorch.exir.tracer import Value
-from torch.export import ExportedProgram
-from executorch.extension.export_util.utils import _to_core_aten
-
-_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
- _check_ir_validity=True,
- _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue
-)
-
-def nncf_core_aten_to_edge(
- core_aten_exir_ep: ExportedProgram,
- edge_constant_methods: Optional[Dict[str, Any]] = None,
- edge_compile_config=None,
- verbose=True,
-) -> EdgeProgramManager:
- if not edge_compile_config:
- edge_compile_config = exir.EdgeCompileConfig(
- _check_ir_validity=False, # quant ops currently break ir verification
- )
- edge_manager: EdgeProgramManager = to_edge_with_preserved_ops(
- core_aten_exir_ep,
- constant_methods=edge_constant_methods,
- compile_config=edge_compile_config,
- preserve_ops=[torch.ops.aten.stack.default,],
- )
- if verbose:
- logging.info(f"Exported graph:\n{edge_manager.exported_program()}")
- return edge_manager
-
-def nncf_export_to_edge(
- model: Union[torch.fx.GraphModule, torch.nn.Module],
- example_inputs: Tuple[Value, ...],
- *,
- example_kwarg_inputs: Optional[Dict] = None,
- dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
- edge_constant_methods: Optional[Dict[str, Any]] = None,
- edge_compile_config=_EDGE_COMPILE_CONFIG,
- strict=True,
- verbose=True,
-) -> EdgeProgramManager:
- core_aten_ep = _to_core_aten(
- model,
- example_inputs,
- example_kwarg_inputs=example_kwarg_inputs,
- dynamic_shapes=dynamic_shapes,
- strict=strict,
- verbose=verbose,
- )
- return nncf_core_aten_to_edge(
- core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
- )
From 457a868cb01bc1a4be090da18b3e431cf3b506d0 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 9 Jul 2025 11:53:26 +0400
Subject: [PATCH 009/266] Update nncf_observers.py
---
.../quantizer/observers/nncf_observers.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index 54f4348e0ed..977458801a4 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -57,12 +57,14 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
else:
decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
packed_q_weight = decompressor.pack_weight(q_weight)
- new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
@@ -101,14 +103,16 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
else:
decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
packed_q_weight = decompressor.pack_weight(q_weight)
- new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
\ No newline at end of file
+ model.graph.erase_node(observer_node)
From d1e9330b53f96068590b767ec8896a9317a1e954 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Mon, 14 Jul 2025 18:55:40 -0700
Subject: [PATCH 010/266] Add export llama runner build option into openvino
build script
---
backends/openvino/scripts/openvino_build.sh | 28 +++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index c10a3bb4eeb..add946e15ae 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -17,7 +17,7 @@ main() {
# Set build directory
local build_dir="cmake-out"
- # Create and enter the build directory
+ # Enter the Executorch root directory
cd "$EXECUTORCH_ROOT"
rm -rf "${build_dir}"
@@ -32,6 +32,7 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-B"${build_dir}"
@@ -42,7 +43,7 @@ main() {
elif [[ "$build_type" == "--enable_python" ]]; then
echo "Building Python Package with Pybinding"
- # Create and enter the build directory
+ # Enter the Executorch root directory
cd "$EXECUTORCH_ROOT"
./install_executorch.sh --clean
@@ -58,6 +59,29 @@ main() {
# Install torchao
pip install third-party/ao
+    # If the first argument is --llama_runner, build the export llama runner binary
+ # Note: c++ runtime with openvino backend should be built before building export llama runner
+ elif [[ "$build_type" == "--llama_runner" ]]; then
+ echo "Building Export Llama Runner"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DBUILD_TESTING=OFF \
+ -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -B"${build_dir}"/examples/models/llama \
+ examples/models/llama
+
+ # Build the export llama runner
+ cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
else
echo "Error: Argument is not valid: $build_type"
exit 1 # Exit the script with an error code
From cedab9d875e2965f4faaa90e16a1be1adc8d507d Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Mon, 14 Jul 2025 19:10:02 -0700
Subject: [PATCH 011/266] Update README.md
---
examples/openvino/README.md | 48 +++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/examples/openvino/README.md b/examples/openvino/README.md
index 8856ccdce4e..dbce5df1b55 100644
--- a/examples/openvino/README.md
+++ b/examples/openvino/README.md
@@ -183,3 +183,51 @@ Run inference with a given model for 10 iterations:
--model_path=model.pte \
--num_executions=10
```
+
+# Export Llama with OpenVINO Backend
+
+## Download the Model
+Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+
+## Environment Setup
+Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+
+## Export the model:
+Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+
+```
+LLAMA_CHECKPOINT=<path_to_model_dir>/consolidated.00.pth
+LLAMA_PARAMS=<path_to_model_dir>/params.json
+LLAMA_TOKENIZER=<path_to_model_dir>/tokenizer.model
+
+python -u -m examples.models.llama.export_llama \
+ --model "llama3_2" \
+ --checkpoint "${LLAMA_CHECKPOINT:?}" \
+ --params "${LLAMA_PARAMS:?}" \
+ -kv \
+ --openvino \
+ -d fp32 \
+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ --output_name="llama.pte" \
+ --verbose \
+ --disable_dynamic_shape \
+ --tokenizer_path "${LLAMA_TOKENIZER:?}" \
+ --nncf_compression
+```
+
+## Build OpenVINO C++ Runtime with Llama Runner:
+First, build the backend libraries by executing the script below in the `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh
+```
+Then, build the llama runner by executing the script below (with the `--llama_runner` argument), also in the `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --llama_runner
+```
+The executable is saved in `<executorch_root>/cmake-out/examples/models/llama/llama_main`
+
+## Execute Inference Using Llama Runner
+Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
+```
+./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=<path_to_model_dir>/tokenizer.model --prompt="Your custom prompt"
+```
From e54f4c7ef6207733f0907cbe1030124926f6550c Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 19 Aug 2025 15:56:35 -0700
Subject: [PATCH 012/266] Added CMAKE EXPORT Changes
---
backends/openvino/CMakeLists.txt | 12 +++++++++---
backends/openvino/scripts/openvino_build.sh | 8 +++-----
examples/models/llama/CMakeLists.txt | 3 +--
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index cb240805665..a2b982babab 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -38,7 +38,11 @@ add_library(openvino_backend STATIC .)
target_compile_options(openvino_backend PRIVATE -frtti -fexceptions)
# Include Executorch directories
-target_include_directories(openvino_backend PUBLIC ${COMMON_INCLUDE_DIRS})
+target_include_directories(openvino_backend
+ PUBLIC
+  $<BUILD_INTERFACE:${COMMON_INCLUDE_DIRS}>
+)
+
# Link OpenVINO and ExecuteTorch core libraries
target_link_libraries(
@@ -77,5 +81,7 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
)
endif()
-# Install OpenVINO backend library to the lib directory
-install(TARGETS openvino_backend DESTINATION lib)
+# Install OpenVINO backend library and export target
+install(TARGETS openvino_backend
+ EXPORT ExecuTorchTargets
+ DESTINATION lib)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index 7f903086163..08741840ddb 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -33,6 +33,8 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-B"${build_dir}"
@@ -72,14 +74,10 @@ main() {
# Configure the project with CMake
# Note: Add any additional configuration options you need here
- cmake -DBUILD_TESTING=OFF \
- -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
-DCMAKE_BUILD_TYPE=Release \
- -DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-B"${build_dir}"/examples/models/llama \
examples/models/llama
-
# Build the export llama runner
cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
else
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index c469a69596c..a2a1f4efa05 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -192,9 +192,8 @@ endif()
# Openvino backend
if(TARGET openvino_backend)
find_package(OpenVINO REQUIRED)
- target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core)
list(APPEND link_libraries openvino_backend)
- target_link_options_shared_lib(openvino_backend)
+ executorch_target_link_options_shared_lib(openvino_backend)
endif()
if(TARGET coremldelegate)
From c12a4bafd441be0a77f909c063fcb883a8ac900b Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 20 Aug 2025 18:07:33 -0700
Subject: [PATCH 013/266] code formatting updates
---
backends/openvino/CMakeLists.txt | 14 +-
backends/openvino/partitioner.py | 38 +++--
.../quantizer/observers/nncf_observers.py | 133 +++++++++++++-----
backends/openvino/quantizer/quantizer.py | 95 +++++++++----
backends/openvino/runtime/OpenvinoBackend.cpp | 26 ++--
examples/models/llama/export_llama_lib.py | 45 +++---
extension/llm/export/partitioner_lib.py | 6 +-
7 files changed, 243 insertions(+), 114 deletions(-)
diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index a2b982babab..94f47c5e929 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -38,12 +38,10 @@ add_library(openvino_backend STATIC .)
target_compile_options(openvino_backend PRIVATE -frtti -fexceptions)
# Include Executorch directories
-target_include_directories(openvino_backend
- PUBLIC
- $
+target_include_directories(
+ openvino_backend PUBLIC $
)
-
# Link OpenVINO and ExecuteTorch core libraries
target_link_libraries(
openvino_backend PRIVATE openvino::runtime executorch_core
@@ -82,6 +80,8 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
endif()
# Install OpenVINO backend library and export target
-install(TARGETS openvino_backend
- EXPORT ExecuTorchTargets
- DESTINATION lib)
+install(
+ TARGETS openvino_backend
+ EXPORT ExecuTorchTargets
+ DESTINATION lib
+)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index b508a698cab..a2920285f99 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -25,12 +25,14 @@
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase
+
class PatternNode:
op_types = {}
def __init__(self):
self.op_types = {}
+
class OpenvinoOperatorsSupport(OperatorSupportBase):
def __init__(
@@ -135,18 +137,24 @@ def ops_to_not_decompose(
]
return (ops_not_decompose, None)
- def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+ def check_pattern(
+ self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
+ ) -> bool:
if node.op == "call_function":
if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
pt_input_nodes = node.all_input_nodes
- pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)]
+ pattern_input_ops = pattern.op_types[
+ "call_function" + ":" + str(node.target.__name__)
+ ]
if pattern_input_ops is None:
enabled_ops.append(node)
return True
if len(pt_input_nodes) != len(pattern_input_ops):
return False
for i in range(len(pt_input_nodes)):
- if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+ if not self.check_pattern(
+ pt_input_nodes[i], pattern_input_ops[i], enabled_ops
+ ):
return False
enabled_ops.append(node)
return True
@@ -167,14 +175,24 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
const_node.op_types["get_attr"] = None
const_node.op_types["placeholder"] = None
bitwise_right_shift_node = PatternNode
- bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
+ bitwise_right_shift_node.op_types[
+ "call_function:aten.bitwise_right_shift.Tensor_Scalar"
+ ] = [const_node]
bitwise_and_node = PatternNode
- bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node]
+ bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [
+ const_node
+ ]
stack_node = PatternNode
- stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
+ stack_node.op_types["call_function:aten.stack.default"] = [
+ bitwise_and_node,
+ bitwise_right_shift_node,
+ ]
for node in graph_module.graph.nodes:
- if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default":
+ if (
+ str(node.op) == "call_function"
+ and str(node.target.__name__) == "aten.stack.default"
+ ):
enabled_ops = []
pattern_match = self.check_pattern(node, stack_node, enabled_ops)
if pattern_match:
@@ -191,7 +209,11 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
- OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name),
+ OpenvinoOperatorsSupport(
+ self._op_types_to_skip,
+ self._op_names_to_skip,
+ self._enabled_ops_by_name,
+ ),
allows_single_node_partition=True,
)
partition_list = partitioner.propose_partitions()
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index 977458801a4..aa531336d0c 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -7,38 +7,65 @@
from typing import Tuple
import torch
-from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size
+from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
+from nncf.experimental.torch.fx.transformations import (
+ constant_update_fn,
+ module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.algorithms.weight_compression.config import (
+ WeightCompressionConfig,
+)
+
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+ do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor
+from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
+from nncf.torch.quantization.layers import (
+ INT4AsymmetricWeightsDecompressor,
+ INT4SymmetricWeightsDecompressor,
+ INT8AsymmetricWeightsDecompressor,
+ INT8SymmetricWeightsDecompressor,
+)
+from torch.ao.quantization.observer import (
+ get_block_size,
+ MappingType,
+ PerAxis,
+ PerChannelMinMaxObserver,
+ PerGroup,
+)
from torch.ao.quantization.pt2e._affine_quantization import (
_get_reduction_params,
AffineQuantizedMinMaxObserver,
)
-from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor
-from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder
-from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
-from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
-from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
-from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.parameters import CompressWeightsMode
-from nncf.tensor.tensor import Tensor
class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM
- assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
- self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)
+ qmode = (
+ CompressWeightsMode.INT4_ASYM
+ if self.mapping_type == MappingType.ASYMMETRIC
+ else CompressWeightsMode.INT4_SYM
+ )
+ assert isinstance(
+ self.granularity, PerGroup
+ ), "Only PerGroup granularity is supported"
+ self.wc_config = WeightCompressionConfig(
+ mode=qmode, group_size=self.granularity.group_size
+ )
def calculate_qparams(self, weight):
assert hasattr(self, "min_val") and hasattr(
self, "max_val"
), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- _, reduction_dims = _get_reduction_params(
- self.block_size, weight.size()
- )
+ _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
assert len(reduction_dims) == 1, "Only 1-D group size is supported"
reduction_dims = reduction_dims[0] - 1
- q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -50,23 +77,38 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
+
with model.graph.inserting_before(observer_node):
- if(zero_point is not None):
- decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype)
+ if zero_point is not None:
+ decompressor = INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
else:
- decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
+ decompressor = INT4SymmetricWeightsDecompressor(
+ scale, q_weight.shape, original_weight.shape, original_weight.dtype
+ )
packed_q_weight = decompressor.pack_weight(q_weight)
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
- decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
- decompressor_name,
- )(model)
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
model.graph.erase_node(observer_node)
@@ -75,7 +117,11 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
class NNCFInt8observer(PerChannelMinMaxObserver):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM
+ qmode = (
+ CompressWeightsMode.INT8_SYM
+ if self.qscheme == torch.per_channel_symmetric
+ else CompressWeightsMode.INT8_ASYM
+ )
self.wc_config = WeightCompressionConfig(mode=qmode)
def calculate_qparams(self, weight):
@@ -84,10 +130,10 @@ def calculate_qparams(self, weight):
), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
self.granularity = PerAxis(axis=self.ch_axis)
self.block_size = get_block_size(weight.shape, self.granularity)
- _, reduction_dims = _get_reduction_params(
- self.block_size, weight.size()
- )
- q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -98,21 +144,32 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
q_weight, scale, zero_point = self.calculate_qparams(original_weight)
with model.graph.inserting_before(observer_node):
- if(zero_point is not None):
- decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
+ if zero_point is not None:
+ decompressor = INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
else:
- decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ decompressor = INT8SymmetricWeightsDecompressor(
+ scale, original_weight.dtype
+ )
packed_q_weight = decompressor.pack_weight(q_weight)
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
- decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
- decompressor_name,
- )(model)
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index bf7fd0859d5..820d5dd49ba 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,14 +15,20 @@
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
+from executorch.backends.openvino.quantizer.observers.nncf_observers import (
+ NNCFInt8observer,
+ PTPerBlockParamObserver,
+)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
+from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig
+from nncf.quantization.quantize_model import get_weight_compression_configuration
from torchao.quantization.pt2e import (
HistogramObserver,
+ MappingType,
PerChannelMinMaxObserver,
- UniformQuantizationObserverBase,
PerGroup,
- MappingType,
+ UniformQuantizationObserverBase,
)
from torchao.quantization.pt2e.quantizer import (
EdgeOrNode,
@@ -32,9 +38,6 @@
Quantizer,
SharedQuantizationSpec,
)
-from nncf.quantization.quantize_model import get_weight_compression_configuration
-from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
-from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer
QUANT_ANNOTATION_KEY = "quantization_annotation"
from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
@@ -81,7 +84,12 @@ def __init__(
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
self.mode = mode
- self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
+ self.wc_modes = [
+ QuantizationMode.INT4_ASYM_WC,
+ QuantizationMode.INT4_SYM_WC,
+ QuantizationMode.INT8_ASYM_WC,
+ QuantizationMode.INT8_SYM_WC,
+ ]
if mode == QuantizationMode.INT8_SYM:
preset = quantization.structs.QuantizationPreset.PERFORMANCE
model_type = None
@@ -91,7 +99,7 @@ def __init__(
else:
preset = None
model_type = nncf.parameters.ModelType.TRANSFORMER
- if(self.mode not in self.wc_modes):
+ if self.mode not in self.wc_modes:
self._min_max_algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
@@ -100,16 +108,16 @@ def __init__(
self._algo = self._min_max_algo
else:
weight_compression_configuration = get_weight_compression_configuration(
- mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode
- **kwargs
+ mode.value.replace(
+ "_wc", ""
+ ), # Mode value has to match NNCF CompressWeightsMode
+ **kwargs,
)
self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
- subset_size=None,
- **weight_compression_configuration
+ subset_size=None, **weight_compression_configuration
)
self._algo = self._weight_compression_algo
-
def set_ignored_scope(
self,
names: Optional[List[str]] = None,
@@ -153,20 +161,40 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
defaultdict(QuantizationAnnotation)
)
# Serperate into annotation for quantize and compress
- if(self.mode in self.wc_modes):
+ if self.mode in self.wc_modes:
self._algo.set_backend_entity(model)
nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
for node in nodes_to_compress:
- quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
+ quantization_insertion_point = (
+ quantization.quantizer_setup.WeightQuantizationInsertionPoint(
+ target_node_name=node.node_name
+ )
+ )
group_size = self._algo._group_size
- num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8
- qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
+ num_bits = (
+ 4
+ if self.mode
+ in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
+ else 8
+ )
+ qmode = (
+ QuantizationScheme.SYMMETRIC
+ if self.mode
+ in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
+ else QuantizationScheme.ASYMMETRIC
+ )
nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
- qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
+ qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
+ qip=quantization_insertion_point,
+ qconfig=nncf_qconfig,
+ directly_quantized_operator_node_names=[node],
+ )
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qp, group_size=group_size, weights_only=True
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
else:
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -175,7 +203,9 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qp
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -186,7 +216,8 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
root_qp = quantization_setup.quantization_points[root_quantizer_id]
if any(
- root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+ root_qp.qconfig
+ != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
):
qps = [
@@ -340,7 +371,9 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
- qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
+ qp: quantization.quantizer_setup.QuantizationPointBase,
+ group_size=-1,
+ weights_only=False,
) -> QuantizationSpec:
"""
Retrieves the quantization configuration from the given quantization point and
@@ -368,8 +401,12 @@ def _get_torch_ao_qspec_from_nncf_config(
else torch.per_tensor_affine
)
if is_weight:
- mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
- if qconfig.num_bits==4:
+ mapping_type = (
+ MappingType.SYMMETRIC
+ if qconfig.mode == QuantizationScheme.SYMMETRIC
+ else MappingType.ASYMMETRIC
+ )
+ if qconfig.num_bits == 4:
extra_args["mapping_type"] = mapping_type
extra_args["target_dtype"] = torch.int8
extra_args["granularity"] = PerGroup(group_size=group_size)
@@ -378,16 +415,18 @@ def _get_torch_ao_qspec_from_nncf_config(
quant_max = 7
dtype = torch.int8
channel_axis = 0
- elif qconfig.num_bits==8:
- observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ elif qconfig.num_bits == 8:
+ observer = (
+ NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ )
quant_min = -128
quant_max = 127
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
- torch.per_channel_symmetric
- if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
- else torch.per_channel_affine
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
)
else:
observer = (
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index 546f4d68573..bac006ce916 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -116,23 +116,23 @@ exr::Error OpenvinoBackend::execute(
infer_request->set_input_tensor(i, ov_input_tensor);
if (args[i]->isInt()) {
- int64_t *val = &(args[i]->payload.copyable_union.as_int);
+ int64_t* val = &(args[i]->payload.copyable_union.as_int);
- // Create OpenVINO tensor from integer input
- ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
- infer_request->set_input_tensor(i, ov_input_tensor);
+ // Create OpenVINO tensor from integer input
+ ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+ infer_request->set_input_tensor(i, ov_input_tensor);
} else {
- auto input_tensor = args[i]->toTensor();
- ov::Shape input_shape(
- input_tensor.sizes().begin(), input_tensor.sizes().end());
+ auto input_tensor = args[i]->toTensor();
+ ov::Shape input_shape(
+ input_tensor.sizes().begin(), input_tensor.sizes().end());
- // Convert input tensor to OpenVINO tensor
- ov::element::Type ov_type =
- convert_to_openvino_type(input_tensor.scalar_type());
- ov::Tensor ov_input_tensor(
- ov_type, input_shape, input_tensor.mutable_data_ptr());
+ // Convert input tensor to OpenVINO tensor
+ ov::element::Type ov_type =
+ convert_to_openvino_type(input_tensor.scalar_type());
+ ov::Tensor ov_input_tensor(
+ ov_type, input_shape, input_tensor.mutable_data_ptr());
- infer_request->set_input_tensor(i, ov_input_tensor);
+ infer_request->set_input_tensor(i, ov_input_tensor);
}
}
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 7b74ee21f77..47527a326f9 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -906,7 +906,6 @@ def _to_edge_and_lower_llama_openvino(
partitioners.append(get_openvino_partitioner(openvino_device))
modelname = f"openvino_{modelname}"
-
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
@@ -915,8 +914,9 @@ def _to_edge_and_lower_llama_openvino(
# TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
if nncf_compression:
try:
- import nncf
from functools import partial
+
+ import nncf
from pytorch_tokenizers import get_tokenizer
except ImportError:
raise ImportError(
@@ -924,9 +924,7 @@ def _to_edge_and_lower_llama_openvino(
)
tokenizer = get_tokenizer(builder_exported.tokenizer_path)
- def transform_fn(
- prompts: str, tokenizer
- ):
+ def transform_fn(prompts: str, tokenizer):
tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
logging.error(tokenized_text)
@@ -938,20 +936,33 @@ def transform_fn(
return inputs
- builder_exported.calibration_data = [builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data
- builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data
+ builder_exported.calibration_data = (
+ [builder_exported.calibration_data]
+ if isinstance(builder_exported.calibration_data, str)
+ else builder_exported.calibration_data
+ )
+ builder_exported.calibration_data = (
+ [
+ word
+ for prompt in builder_exported.calibration_data
+ for word in prompt.split()
+ ]
+ if not builder_exported.dynamic_shapes
+ else builder_exported.calibration_data
+ )
builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
-
- builder = builder_exported.to_edge_transform_and_lower(
- partitioners
- )
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(
+ builder_exported.calibration_data,
+ transform_func=partial(transform_fn, tokenizer=tokenizer),
+ ),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index b34f0a85344..185bc011a32 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -63,12 +63,11 @@ def get_mps_partitioner(use_kv_cache: bool = False):
compile_specs = [CompileSpec("use_fp16", bytes([True]))]
return MPSPartitioner(compile_specs) # pyre-fixme[16]
+
def get_openvino_partitioner(device: str):
try:
+ from executorch.backends.openvino.partitioner import OpenvinoPartitioner
from executorch.exir.backend.backend_details import CompileSpec
- from executorch.backends.openvino.partitioner import (
- OpenvinoPartitioner,
- )
except ImportError:
raise ImportError(
"Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino"
@@ -77,6 +76,7 @@ def get_openvino_partitioner(device: str):
compile_specs = [CompileSpec("device", device.encode())]
return OpenvinoPartitioner(compile_specs)
+
def get_coreml_partitioner(
ios: int = 15,
embedding_quantize: Optional[str] = None,
From bf659439771f5a52ec40a00070ef5ac5c6237cfa Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 20 Aug 2025 18:54:57 -0700
Subject: [PATCH 014/266] code formatting changes
---
.../quantizer/observers/nncf_observers.py | 31 ++++++++++---------
backends/openvino/quantizer/quantizer.py | 9 ++++--
2 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index aa531336d0c..f6ac2a3cb91 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -4,41 +4,42 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-from typing import Tuple
-
import torch
-from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
-from nncf.experimental.torch.fx.transformations import (
+from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
+ get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
constant_update_fn,
module_insertion_transformation_builder,
)
-from nncf.parameters import CompressWeightsMode
-from nncf.quantization.algorithms.weight_compression.config import (
+from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionConfig,
)
-from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
)
-from nncf.tensor.tensor import Tensor
-from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
-from nncf.torch.quantization.layers import (
+from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
+ PTTargetPoint,
+ TargetType,
+)
+from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
INT4AsymmetricWeightsDecompressor,
INT4SymmetricWeightsDecompressor,
INT8AsymmetricWeightsDecompressor,
INT8SymmetricWeightsDecompressor,
)
-from torch.ao.quantization.observer import (
+from torchao.quantization.observer import AffineQuantizedMinMaxObserver
+from torchao.quantization.pt2e import (
get_block_size,
MappingType,
PerAxis,
PerChannelMinMaxObserver,
PerGroup,
)
-from torch.ao.quantization.pt2e._affine_quantization import (
- _get_reduction_params,
- AffineQuantizedMinMaxObserver,
-)
+from torchao.quantization.quant_primitives import _get_reduction_params
class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 820d5dd49ba..cd78f6907c7 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -21,8 +21,13 @@
)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig
-from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import ( # type: ignore[import-untyped]
+ QuantizationScheme,
+ QuantizerConfig,
+)
+from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
+ get_weight_compression_configuration,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
MappingType,
From 30a1a258b22d1471c0aae328f30a5910af6af118 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 26 Aug 2025 12:31:49 +0400
Subject: [PATCH 015/266] openvino quantizer refactored
---
backends/openvino/quantizer/__init__.py | 4 +-
backends/openvino/quantizer/observers.py | 286 ++++++++++++
.../quantizer/observers/nncf_observers.py | 176 --------
backends/openvino/quantizer/quantizer.py | 412 ++++++++++--------
examples/models/llama/export_llama_lib.py | 9 +
extension/llm/export/quantizer_lib.py | 38 +-
6 files changed, 573 insertions(+), 352 deletions(-)
create mode 100644 backends/openvino/quantizer/observers.py
delete mode 100644 backends/openvino/quantizer/observers/nncf_observers.py
diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index df038483f2f..0fd8c10b249 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model
+from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
-__all__ = ["OpenVINOQuantizer", "quantize_model"]
+__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
new file mode 100644
index 00000000000..2ea66f11a55
--- /dev/null
+++ b/backends/openvino/quantizer/observers.py
@@ -0,0 +1,286 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: disable-error-code=import-not-found
+
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped]
+
+import torch
+from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped]
+ GraphConverter,
+)
+
+from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
+ get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
+ constant_update_fn,
+ module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
+ WeightCompressionConfig,
+)
+from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped]
+ FXWeightCompressionAlgoBackend,
+)
+from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
+ do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
+ PTTargetPoint,
+ TargetType,
+)
+from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
+ BaseWeightsDecompressor,
+ INT4AsymmetricWeightsDecompressor,
+ INT4SymmetricWeightsDecompressor,
+ INT8AsymmetricWeightsDecompressor,
+ INT8SymmetricWeightsDecompressor,
+)
+from torchao.quantization.pt2e import MappingType, ObserverBase
+from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+
+class WeightObserverBase(ObserverBase, ABC):
+ """
+ Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
+ """
+
+ def calculate_qparams( # type: ignore[override]
+ self,
+ weight: torch.Tensor,
+ observer_node: torch.fx.Node,
+ model: torch.fx.GraphModule,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Calculate quantization parameters such as scale, quantized weight and zero point.
+
+ :param weight: FP weight to be used for calculating qparams.
+ :return: quantization params quantized weight, scale and zero point
+ """
+ ndims = len(weight.size())
+ node_with_weight, weight_port_id = (
+ WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
+ )
+ _, node_metatype = GraphConverter.get_node_type_and_metatype(
+ node_with_weight, model
+ )
+ # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
+ node_metatype = (
+ om.PTAtenEmbeddingMetatype
+ if node_metatype == om.PTEmbeddingMetatype
+ else node_metatype
+ )
+ reduction_dims = get_weight_compression_reduction_axes(
+ node_metatype, weight_port_id, ndims
+ )
+ reduction_dims = tuple(reduction_dims)
+
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x
+
+ @staticmethod
+ def get_node_with_weight_and_port_ids(
+ observer_node: torch.fx.Node, model: torch.fx.GraphModule
+ ) -> Tuple[torch.fx.Node, int]:
+ """
+ Returns the node which contains the weight and the weight port id.
+
+ :param observer_node: Observer node for the weight.
+ :param graph: The model.
+ :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
+ """
+ for node in model.graph.nodes:
+ if observer_node in node.all_input_nodes:
+ return node, node.all_input_nodes.index(observer_node)
+ msg = f"Observer node {observer_node.name} has no consumer node"
+ raise RuntimeError(msg)
+
+ def convert(
+ self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
+ ) -> None:
+ """
+ Converts the weight observer node into a decompression subgraph after calibration.
+ This method is responsible for transforming the model after the quantization preparation
+ and calibration phases. It replaces the observer node with the quantized weight and a decompression
+ module.
+
+ :param model: A `torch.fx.GraphModule` representing the statically traced model
+ with observer nodes attached and calibrated.
+ :param observer_node: The `torch.fx.Node` corresponding to the observer module for
+ the weight that is being transformed into a compressed representation.
+ """
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(
+ original_weight, observer_node, model
+ )
+
+ decompressor = self._create_decompressor(
+ scale, zero_point, q_weight, original_weight
+ )
+ packed_q_weight = decompressor.pack_weight(q_weight)
+
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
+
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
+ model.graph.erase_node(observer_node)
+
+ @abstractmethod
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ """
+ Used to return the respective NNCF decompressor for different types of quantization.
+
+ :param scale: Calculated scale quantization parameter.
+ :param zero_point: Calculated zero_point quantization parameter.
+ :param q_weight: Calculated quantized weight.
+ :param original_weight: FP weight.
+ :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
+ """
+ pass
+
+ @abstractmethod
+ def get_wc_config(self) -> WeightCompressionConfig:
+ """
+ Used to return the respective NNCF Weight Compression Config.
+
+ :return: Weight compression config with the compression information such as qmode, group_size etc.
+ """
+ pass
+
+
+class INT4WeightObserver(WeightObserverBase):
+ """
+ This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+ """
+
+ def __init__(
+ self,
+ group_size: int,
+ mapping_type: MappingType,
+ target_dtype: torch.dtype,
+ *args,
+ **kwargs,
+ ) -> None:
+ """
+ :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
+ :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
+ :param target_dtype: target dtype for quantization such as int8, uint8, etc.
+ """
+ super().__init__(dtype=target_dtype, is_dynamic=False)
+ self.wc_config = None
+ self.mapping_type = mapping_type
+
+ qmode = (
+ CompressWeightsMode.INT4_ASYM
+ if self.mapping_type == MappingType.ASYMMETRIC
+ else CompressWeightsMode.INT4_SYM
+ )
+ self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
+
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ if zero_point is not None:
+ return INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
+ else:
+ return INT4SymmetricWeightsDecompressor(
+ scale, q_weight.shape, original_weight.shape, original_weight.dtype
+ )
+
+ def get_wc_config(self):
+ return self.wc_config
+
+
+class INT8WeightObserver(WeightObserverBase):
+ """
+ This class defines the behavior for Int8 WC which has per channel granularity.
+ """
+
+ def __init__(
+ self,
+ qscheme: torch.qscheme,
+ dtype: torch.dtype,
+ ch_axis: int = 0,
+ *args,
+ **kwargs,
+ ) -> None:
+ """
+ :param qscheme: Quantization scheme which is per-channel for Int8 WC.
+ :param dtype: dtype for quantization such as int8, uint8, etc..
+ :param ch_axis: Channel axis.
+ """
+ super().__init__(dtype=dtype, is_dynamic=False)
+ self.wc_config = None
+ self.qscheme = qscheme
+
+ qmode = (
+ CompressWeightsMode.INT8_SYM
+ if self.qscheme == torch.per_channel_symmetric
+ else CompressWeightsMode.INT8_ASYM
+ )
+ self.wc_config = WeightCompressionConfig(mode=qmode)
+
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ if zero_point is not None:
+ return INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
+ else:
+ return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+
+ def get_wc_config(self):
+ return self.wc_config
\ No newline at end of file
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
deleted file mode 100644
index f6ac2a3cb91..00000000000
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Qualcomm Innovation Center, Inc.
-# All rights reserved
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import torch
-from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
- get_tensor_constant_from_node,
-)
-from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
- constant_update_fn,
- module_insertion_transformation_builder,
-)
-from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
-from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
- WeightCompressionConfig,
-)
-
-from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
- do_integer_quantization,
-)
-from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
-from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
- PTTargetPoint,
- TargetType,
-)
-from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
- INT4AsymmetricWeightsDecompressor,
- INT4SymmetricWeightsDecompressor,
- INT8AsymmetricWeightsDecompressor,
- INT8SymmetricWeightsDecompressor,
-)
-from torchao.quantization.observer import AffineQuantizedMinMaxObserver
-from torchao.quantization.pt2e import (
- get_block_size,
- MappingType,
- PerAxis,
- PerChannelMinMaxObserver,
- PerGroup,
-)
-from torchao.quantization.quant_primitives import _get_reduction_params
-
-
-class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- qmode = (
- CompressWeightsMode.INT4_ASYM
- if self.mapping_type == MappingType.ASYMMETRIC
- else CompressWeightsMode.INT4_SYM
- )
- assert isinstance(
- self.granularity, PerGroup
- ), "Only PerGroup granularity is supported"
- self.wc_config = WeightCompressionConfig(
- mode=qmode, group_size=self.granularity.group_size
- )
-
- def calculate_qparams(self, weight):
- assert hasattr(self, "min_val") and hasattr(
- self, "max_val"
- ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
- assert len(reduction_dims) == 1, "Only 1-D group size is supported"
- reduction_dims = reduction_dims[0] - 1
- q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
- )
- zp = zp.data if zp is not None else None
- return q_weight.data, scale.data, zp
-
- def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
- print("calling convert")
- assert (
- self.original_dtype is not None
- ), "Expecting original_dtype to be populated"
- weight_node = observer_node.args[0]
- original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
- with model.graph.inserting_before(observer_node):
- if zero_point is not None:
- decompressor = INT4AsymmetricWeightsDecompressor(
- scale,
- zero_point,
- q_weight.shape,
- original_weight.shape,
- original_weight.dtype,
- )
- else:
- decompressor = INT4SymmetricWeightsDecompressor(
- scale, q_weight.shape, original_weight.shape, original_weight.dtype
- )
- packed_q_weight = decompressor.pack_weight(q_weight)
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(
- compressed_weight_name.replace(".", "_").split("_")[:-2]
- )
- decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
- module_insertion_transformation_builder(
- decompressor,
- [
- PTTargetPoint(
- TargetType.OPERATOR_POST_HOOK,
- target_node_name=compressed_weight_name,
- )
- ],
- decompressor_name,
- )(model)
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
-
-
-class NNCFInt8observer(PerChannelMinMaxObserver):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- qmode = (
- CompressWeightsMode.INT8_SYM
- if self.qscheme == torch.per_channel_symmetric
- else CompressWeightsMode.INT8_ASYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode)
-
- def calculate_qparams(self, weight):
- assert hasattr(self, "min_val") and hasattr(
- self, "max_val"
- ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- self.granularity = PerAxis(axis=self.ch_axis)
- self.block_size = get_block_size(weight.shape, self.granularity)
- _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
- q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
- )
- zp = zp.data if zp is not None else None
- return q_weight.data, scale.data, zp
-
- def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
- print("calling convert")
- weight_node = observer_node.args[0]
- original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
- with model.graph.inserting_before(observer_node):
- if zero_point is not None:
- decompressor = INT8AsymmetricWeightsDecompressor(
- scale, zero_point, original_weight.dtype
- )
- else:
- decompressor = INT8SymmetricWeightsDecompressor(
- scale, original_weight.dtype
- )
- packed_q_weight = decompressor.pack_weight(q_weight)
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(
- compressed_weight_name.replace(".", "_").split("_")[:-2]
- )
- decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
- module_insertion_transformation_builder(
- decompressor,
- [
- PTTargetPoint(
- TargetType.OPERATOR_POST_HOOK,
- target_node_name=compressed_weight_name,
- )
- ],
- decompressor_name,
- )(model)
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index cd78f6907c7..31d41bff7be 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,16 +15,11 @@
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
-from executorch.backends.openvino.quantizer.observers.nncf_observers import (
- NNCFInt8observer,
- PTPerBlockParamObserver,
+from executorch.backends.openvino.quantizer.observers import (
+ INT4WeightObserver,
+ INT8WeightObserver,
)
-
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.common.quantization.structs import ( # type: ignore[import-untyped]
- QuantizationScheme,
- QuantizerConfig,
-)
from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
get_weight_compression_configuration,
)
@@ -32,7 +27,6 @@
HistogramObserver,
MappingType,
PerChannelMinMaxObserver,
- PerGroup,
UniformQuantizationObserverBase,
)
from torchao.quantization.pt2e.quantizer import (
@@ -45,7 +39,6 @@
)
QUANT_ANNOTATION_KEY = "quantization_annotation"
-from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
class QuantizationMode(Enum):
@@ -55,15 +48,19 @@ class QuantizationMode(Enum):
- INT8_SYM: INT8 symmetric quantization for both activations and weights.
- INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
- INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
+ - INT8WO_SYM: INT8 symmetric quantization for weights only.
+ - INT8WO_ASYM: INT8 asymmetric quantization for weights only.
+ - INT4WO_SYM: INT4 symmetric quantization for weights only.
+ - INT4WO_ASYM: INT4 asymmetric quantization for weights only
"""
INT8_SYM = "int8_sym"
INT8_MIXED = "int8_mixed"
INT8_TRANSFORMER = "int8_transformer"
- INT8_SYM_WC = "int8_sym_wc"
- INT8_ASYM_WC = "int8_asym_wc"
- INT4_SYM_WC = "int4_sym"
- INT4_ASYM_WC = "int4_asym"
+ INT8WO_SYM = "int8wo_sym"
+ INT8WO_ASYM = "int8wo_asym"
+ INT4WO_SYM = "int4wo_sym"
+ INT4WO_ASYM = "int4wo_asym"
class OpenVINOQuantizer(Quantizer):
@@ -72,10 +69,17 @@ class OpenVINOQuantizer(Quantizer):
optimally for the inference via OpenVINO.
"""
+ WEIGHTS_ONLY_COMPRESSION_MODES = (
+ QuantizationMode.INT4WO_SYM,
+ QuantizationMode.INT4WO_ASYM,
+ QuantizationMode.INT8WO_SYM,
+ QuantizationMode.INT8WO_ASYM,
+ )
+
def __init__(
self,
*,
- mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+ mode: QuantizationMode = QuantizationMode.INT8_SYM,
**kwargs,
):
"""
@@ -89,28 +93,21 @@ def __init__(
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
self.mode = mode
- self.wc_modes = [
- QuantizationMode.INT4_ASYM_WC,
- QuantizationMode.INT4_SYM_WC,
- QuantizationMode.INT8_ASYM_WC,
- QuantizationMode.INT8_SYM_WC,
- ]
- if mode == QuantizationMode.INT8_SYM:
- preset = quantization.structs.QuantizationPreset.PERFORMANCE
- model_type = None
- elif mode == QuantizationMode.INT8_MIXED:
- preset = quantization.structs.QuantizationPreset.MIXED
- model_type = None
- else:
- preset = None
- model_type = nncf.parameters.ModelType.TRANSFORMER
- if self.mode not in self.wc_modes:
- self._min_max_algo = (
+ if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+ if mode == QuantizationMode.INT8_SYM:
+ preset = quantization.structs.QuantizationPreset.PERFORMANCE
+ model_type = None
+ elif mode == QuantizationMode.INT8_MIXED:
+ preset = quantization.structs.QuantizationPreset.MIXED
+ model_type = None
+ else:
+ preset = None
+ model_type = nncf.parameters.ModelType.TRANSFORMER
+ self._algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
)
)
- self._algo = self._min_max_algo
else:
weight_compression_configuration = get_weight_compression_configuration(
mode.value.replace(
@@ -118,10 +115,9 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
- self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+ self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=None, **weight_compression_configuration
)
- self._algo = self._weight_compression_algo
def set_ignored_scope(
self,
@@ -158,104 +154,131 @@ def get_nncf_quantization_setup(
self._algo._set_backend_entity(model)
return self._algo.find_quantization_setup(model, nncf_graph)
- def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
- nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+ def _annotate_weight_compression(
+ self,
+ model: torch.fx.GraphModule,
+ graph: torch.fx.Graph,
+ nncf_graph: NNCFGraph,
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+ ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+ """
+ Annotates the model graph with weight-only quantization specs.
- graph = model.graph
- node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
- defaultdict(QuantizationAnnotation)
- )
- # Serperate into annotation for quantize and compress
- if self.mode in self.wc_modes:
- self._algo.set_backend_entity(model)
- nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
- for node in nodes_to_compress:
- quantization_insertion_point = (
- quantization.quantizer_setup.WeightQuantizationInsertionPoint(
- target_node_name=node.node_name
- )
- )
- group_size = self._algo._group_size
- num_bits = (
- 4
- if self.mode
- in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
- else 8
- )
- qmode = (
- QuantizationScheme.SYMMETRIC
- if self.mode
- in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
- else QuantizationScheme.ASYMMETRIC
- )
- nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
- qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
- qip=quantization_insertion_point,
- qconfig=nncf_qconfig,
- directly_quantized_operator_node_names=[node],
- )
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
- qp, group_size=group_size, weights_only=True
+ Identifies compressible nodes in the NNCF graph and attaches the corresponding
+ TorchAO quantization specifications to their weight edges for later transformation.
+
+ :param model: The FX GraphModule to annotate.
+ :param graph: The underlying FX graph.
+ :param nncf_graph: The corresponding NNCF graph.
+ :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+ :return: Updated mapping of FX nodes with weight compression annotations.
+ """
+ self._algo.set_backend_entity(model)
+ nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+
+ for node in nodes_to_compress:
+ target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, node.node_name
+ )
+ annotation = node_vs_torch_annotation[target_node]
+ edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+ group_size = getattr(self._algo, "_group_size", -1)
+ qspec = self._get_torch_ao_qspec_from_nncf_config(
+ qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+ )
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+ return node_vs_torch_annotation
+
+ def _annotate_post_training_quantization(
+ self,
+ model: torch.fx.GraphModule,
+ graph: torch.fx.Graph,
+ nncf_graph: NNCFGraph,
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+ ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+ """
+ Annotates the model graph with post-training quantization configurations.
+
+ Converts NNCF quantization points into TorchAO-compatible quantization specs,
+ assigning them to corresponding nodes or edges. Also handles unified scale groups,
+ ensuring shared quantization specs across grouped quantizers with consistent configs.
+
+ :param model: The FX GraphModule to annotate.
+ :param graph: The underlying FX graph.
+ :param nncf_graph: The corresponding NNCF graph.
+ :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+ :return: Updated mapping of FX nodes with post-training quantization annotations.
+ """
+ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
+
+ for qp in quantization_setup.quantization_points.values():
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+ for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+ nncf_graph, quantizer_ids, quantization_setup
+ )
+ root_qp = quantization_setup.quantization_points[root_quantizer_id]
+
+ if any(
+ root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+ for q_id in quantizer_ids
+ ):
+ qps = [
+ quantization_setup.quantization_points[qid] for qid in quantizer_ids
+ ]
+ raise nncf.InternalError(
+ "Different quantization configs are set to one unified scale group:"
+ f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
)
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- else:
- quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
- for qp in quantization_setup.quantization_points.values():
+ root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, root_qp.insertion_point.target_node_name
+ )
+ root_edge_or_node = self._get_edge_or_node(
+ root_target_node, root_qp, nncf_graph
+ )
+
+ for quantizer_id in quantizer_ids:
+ if quantizer_id == root_quantizer_id:
+ continue
+
+ qspec = SharedQuantizationSpec(root_edge_or_node) # type: ignore[assignment]
+ qp = quantization_setup.quantization_points[quantizer_id]
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
- qp
- )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ return node_vs_torch_annotation
- root_quantizer_id = self._get_unified_scales_root_quantizer_id(
- nncf_graph, quantizer_ids, quantization_setup
- )
- root_qp = quantization_setup.quantization_points[root_quantizer_id]
-
- if any(
- root_qp.qconfig
- != quantization_setup.quantization_points[q_id].qconfig
- for q_id in quantizer_ids
- ):
- qps = [
- quantization_setup.quantization_points[q_id]
- for q_id in quantizer_ids
- ]
- msg = (
- "Different quantization configs are set to one unified scale group:"
- f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
- )
- raise nncf.InternalError(msg)
-
- root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, root_qp.insertion_point.target_node_name
- )
- root_edge_or_node = self._get_edge_or_node(
- root_target_node, root_qp, nncf_graph
- )
-
- for quantizer_id in quantizer_ids:
- if quantizer_id == root_quantizer_id:
- continue
+ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+ nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+ graph = model.graph
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
+ defaultdict(QuantizationAnnotation)
+ )
- qspec = SharedQuantizationSpec(root_edge_or_node)
- qp = quantization_setup.quantization_points[quantizer_id]
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+ node_vs_torch_annotation = self._annotate_weight_compression(
+ model, graph, nncf_graph, node_vs_torch_annotation
+ )
+ else:
+ node_vs_torch_annotation = self._annotate_post_training_quantization(
+ model, graph, nncf_graph, node_vs_torch_annotation
+ )
for node, annotation in node_vs_torch_annotation.items():
- assert Q_ANNOTATION_KEY not in node.meta
- node.meta[Q_ANNOTATION_KEY] = annotation
+ assert QUANT_ANNOTATION_KEY not in node.meta
+ node.meta[QUANT_ANNOTATION_KEY] = annotation
+
return model
@staticmethod
@@ -317,6 +340,36 @@ def _get_edge_or_node_and_annotation(
edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph)
return edge_or_node, annotation
+ @staticmethod
+ def _get_weight_edge(
+ target_node: torch.fx.Node,
+ nncf_graph: NNCFGraph,
+ ):
+ """
+ Returns the FX node corresponding to the weight tensor input of a given operator node.
+ Uses the NNCF graph to identify which input port of the target node holds the weight.
+ If multiple weight ports are present, a warning is issued and only the first one is used.
+
+ :param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
+ :param nncf_graph: NNCFGraph used to determine weight port indices.
+
+ :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
+ """
+ nncf_node = nncf_graph.get_node_by_name(target_node.name)
+ weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
+ nncf_node, nncf_graph
+ )
+ if len(weights_ports_ids) > 1:
+ # TODO(dlyakhov): support quantization for nodes with several weights
+ nncf.common.logging.nncf_logger.warning(
+ f"Quantization of the weighted node {target_node.name}"
+ " is not yet supported by the OpenVINOQuantizer."
+ f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
+ f" Quantizable weights are located on ports: {weights_ports_ids}."
+ )
+ weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
+ return (weight_node, target_node)
+
@staticmethod
def _get_edge_or_node(
target_node: torch.fx.Node,
@@ -333,22 +386,7 @@ def _get_edge_or_node(
"""
ip = qp.insertion_point
if qp.is_weight_quantization_point():
- nncf_node = nncf_graph.get_node_by_name(target_node.name)
- weights_ports_ids = (
- nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
- nncf_node, nncf_graph
- )
- )
- if len(weights_ports_ids) > 1:
- # TODO(dlyakhov): support quantization for nodes with several weights
- nncf.common.logging.nncf_logger.warning(
- f"Quantization of the weighted node {target_node.name}"
- " is not yet supported by the OpenVINOQuantizer."
- f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
- f" Quantizable weights are located on ports: {weights_ports_ids}."
- )
- weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
- return (weight_node, target_node)
+            return OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
if ip.input_port_id is None:
return target_node
@@ -377,22 +415,67 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
qp: quantization.quantizer_setup.QuantizationPointBase,
- group_size=-1,
- weights_only=False,
+ group_size: int = -1,
+ qmode: Optional[QuantizationMode] = None,
+ weights_only: bool = False,
) -> QuantizationSpec:
"""
- Retrieves the quantization configuration from the given quantization point and
- converts it into a QuantizationSpec.
-
- :param qp: An instance of QuantizationPointBase.
- :return: A QuantizationSpec retrieved and converted from the quantization point.
+ Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
+ and `weights_only`. For post-training quantization, only `qp` is required.
+
+ :param qp: Quantization point from NNCF.
+ :param group_size: Group size for INT4 group-wise quantization.
+ :param qmode: Quantization mode for weight compression.
+ :param weights_only: If True, applies weight-only quantization logic.
+ :return: A TorchAO QuantizationSpec.
"""
+ observer: Type[UniformQuantizationObserverBase]
+
# Eps value is copied from nncf/torch/quantization/layers.py
- extra_args = {"eps": 1e-16}
+ extra_args: Dict[str, Any] = {"eps": 1e-16}
+
+ if weights_only:
+ mapping_type = (
+ MappingType.SYMMETRIC
+ if qmode == QuantizationMode.INT4WO_SYM
+ else MappingType.ASYMMETRIC
+ )
+ if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+ extra_args["mapping_type"] = mapping_type
+ extra_args["target_dtype"] = torch.int8
+ extra_args["group_size"] = group_size
+ observer = INT4WeightObserver
+ quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
+ quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = None
+ else:
+ observer = INT8WeightObserver
+ quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
+ quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qmode == QuantizationMode.INT8WO_SYM
+ else torch.per_channel_affine
+ )
+
+ return QuantizationSpec(
+ dtype=dtype,
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ quant_min=quant_min,
+ quant_max=quant_max,
+ qscheme=torch_qscheme,
+ ch_axis=channel_axis,
+ is_dynamic=False,
+ )
+
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
- observer: Type[UniformQuantizationObserverBase]
if qconfig.per_channel:
torch_qscheme = (
torch.per_channel_symmetric
@@ -406,33 +489,16 @@ def _get_torch_ao_qspec_from_nncf_config(
else torch.per_tensor_affine
)
if is_weight:
- mapping_type = (
- MappingType.SYMMETRIC
- if qconfig.mode == QuantizationScheme.SYMMETRIC
- else MappingType.ASYMMETRIC
+ observer = PerChannelMinMaxObserver
+ quant_min = -128
+ quant_max = 127
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
)
- if qconfig.num_bits == 4:
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
- extra_args["granularity"] = PerGroup(group_size=group_size)
- observer = PTPerBlockParamObserver
- quant_min = -8
- quant_max = 7
- dtype = torch.int8
- channel_axis = 0
- elif qconfig.num_bits == 8:
- observer = (
- NNCFInt8observer if weights_only else PerChannelMinMaxObserver
- )
- quant_min = -128
- quant_max = 127
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = (
- torch.per_channel_symmetric
- if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
- else torch.per_channel_affine
- )
else:
observer = (
HistogramObserver
@@ -514,4 +580,4 @@ def quantize_model(
smooth_quant=smooth_quant,
**kwargs,
)
- return quantized_model
+ return quantized_model
\ No newline at end of file
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 47527a326f9..54acf67a21d 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -50,6 +50,7 @@
get_pt2e_quantization_params,
get_pt2e_quantizers,
get_qnn_quantizer,
+ get_ov_quantizer,
get_vulkan_quantizer,
)
from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -205,6 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
choices=[
"xnnpack_dynamic",
"xnnpack_dynamic_qc4",
+ "openvino_8da4w",
+ "openvino_8da8w",
"qnn_8a8w",
"qnn_16a16w",
"qnn_16a4w",
@@ -786,6 +789,12 @@ def get_quantizer_and_quant_params(llm_config):
llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
)
quantizers.append(qnn_quantizer)
+ if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
+ assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ ov_quantizer = get_ov_quantizer(
+ llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+ )
+ quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
coreml_quantizer = get_coreml_quantizer(
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index d87c722363f..4669d09e0e7 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
)
- assert (
+ assert (get_qnn_quantizer
quantization_mode is None
), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -215,6 +215,42 @@ def get_qnn_quantizer(
return qnn_quantizer, quant_dtype
+def get_ov_quantizer(
+ pt2e_quantize: str,
+ group_size: int = 32,
+):
+ try:
+ from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
+
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+
+ backend, quant_config = pt2e_quantize.split("_")
+ assert (
+ backend == "openvino"
+ ), f"The quantization config is for backend {backend} instead of openvino."
+ ov_quantizer = OpenVINOQuantizer()
+ # Manually ignore MP layers.
+ # ov_quantizer.set_ignored_scope()
+
+ extra_quantizer_options = {"group_size": group_size}
+ if quant_config == "8da4w":
+ mode = QuantizationMode.INT4WO_SYM
+
+ elif quant_config == "8da8w":
+ mode = QuantizationMode.INT8WO_SYM
+ else:
+ raise AssertionError(
+ f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
+ )
+
+ ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+
+ return ov_quantizer
+
+
def get_coreml_quantizer(pt2e_quantize: str):
try:
from coremltools.optimize.torch.quantization.quantization_config import (
From 4cc7694433b12f7c8afe4c61b785e5158e0798e0 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 26 Aug 2025 18:32:27 +0400
Subject: [PATCH 016/266] fixes
---
backends/openvino/quantizer/quantizer.py | 10 ++++--
examples/models/llama/export_llama_lib.py | 9 +++--
extension/llm/export/config/llm_config.py | 2 ++
extension/llm/export/quantizer_lib.py | 42 +++++++++++++++++++----
4 files changed, 51 insertions(+), 12 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 31d41bff7be..f594c6fffa8 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,6 +12,7 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
+from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
@@ -176,8 +177,12 @@ def _annotate_weight_compression(
"""
self._algo.set_backend_entity(model)
nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+ ignored_names = self._algo.get_ignored_node_names(nncf_graph)
for node in nodes_to_compress:
+ is_target_node = should_consider_scope(node.node_name, ignored_names)
+ if not is_target_node:
+ continue
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node.node_name
)
@@ -442,9 +447,9 @@ def _get_torch_ao_qspec_from_nncf_config(
else MappingType.ASYMMETRIC
)
if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+ extra_args["group_size"] = group_size
extra_args["mapping_type"] = mapping_type
extra_args["target_dtype"] = torch.int8
- extra_args["group_size"] = group_size
observer = INT4WeightObserver
quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
@@ -454,7 +459,7 @@ def _get_torch_ao_qspec_from_nncf_config(
else:
observer = INT8WeightObserver
quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+ quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
@@ -462,7 +467,6 @@ def _get_torch_ao_qspec_from_nncf_config(
if qmode == QuantizationMode.INT8WO_SYM
else torch.per_channel_affine
)
-
return QuantizationSpec(
dtype=dtype,
observer_or_fake_quant_ctr=observer.with_args(**extra_args),
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 54acf67a21d..269f927e9f6 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -791,8 +791,10 @@ def get_quantizer_and_quant_params(llm_config):
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ group_size = llm_config.quantization.group_size
+ group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
- llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+ llm_config.quantization.pt2e_quantize.value,
)
quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -904,6 +906,7 @@ def _to_edge_and_lower_llama_xnnpack(
def _to_edge_and_lower_llama_openvino(
builder_exported,
modelname,
+ quantizers,
additional_passes,
openvino_device: str = "CPU",
nncf_compression: bool = False,
@@ -935,7 +938,6 @@ def _to_edge_and_lower_llama_openvino(
def transform_fn(prompts: str, tokenizer):
tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
- logging.error(tokenized_text)
inputs = ()
inputs = (
@@ -971,7 +973,7 @@ def transform_fn(prompts: str, tokenizer):
sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
)
- builder = builder_exported.to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1214,6 +1216,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
builder = _to_edge_and_lower_llama_openvino(
builder_exported,
modelname,
+ quantizers,
additional_passes,
openvino_device=llm_config.backend.openvino.device,
nncf_compression=llm_config.backend.openvino.nncf_compression,
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index ab18c19159b..b4175d54cd7 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,6 +275,8 @@ class Pt2eQuantize(str, Enum):
xnnpack_dynamic = "xnnpack_dynamic"
xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+ openvino_8da4w = "openvino_8da4w"
+ openvino_8da8w = "openvino_8da8w"
qnn_8a8w = "qnn_8a8w"
qnn_16a16w = "qnn_16a16w"
qnn_16a4w = "qnn_16a4w"
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 4669d09e0e7..2a20a90d55a 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
)
- assert (get_qnn_quantizer
+ assert (
quantization_mode is None
), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -231,22 +231,52 @@ def get_ov_quantizer(
assert (
backend == "openvino"
), f"The quantization config is for backend {backend} instead of openvino."
- ov_quantizer = OpenVINOQuantizer()
+ assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+
# Manually ignore MP layers.
- # ov_quantizer.set_ignored_scope()
+ fp_node_names = linear_list = [
+ "embedding", # First embedding is kept in Full precision
+ "linear_14",
+ "linear_15",
+ "linear_35",
+ "linear_56",
+ "linear_57",
+ "linear_63",
+ "linear_70",
+ "linear_71",
+ "linear_77",
+ "linear_78",
+ "linear_81",
+ "linear_84",
+ "linear_85",
+ "linear_88",
+ "linear_89",
+ "linear_91",
+ "linear_92",
+ "linear_95",
+ "linear_96",
+ "linear_98",
+ "linear_99",
+ "linear_102",
+ "linear_103",
+ "linear_105",
+ "linear_106",
+ "linear_109",
+ "linear_110",
+ "linear_112",]
- extra_quantizer_options = {"group_size": group_size}
if quant_config == "8da4w":
mode = QuantizationMode.INT4WO_SYM
elif quant_config == "8da8w":
+ group_size = -1
mode = QuantizationMode.INT8WO_SYM
else:
raise AssertionError(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
-
- ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+ ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+ ov_quantizer.set_ignored_scope(names=fp_node_names)
return ov_quantizer
From 5da40a57d7d42363b795d483630b00d9ce4b5f31 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Wed, 27 Aug 2025 13:48:41 +0400
Subject: [PATCH 017/266] support all_layers, backup mode in OVQuantizer
---
backends/openvino/quantizer/quantizer.py | 25 ++++---
examples/models/llama/export_llama_lib.py | 82 ++++++++++-------------
extension/llm/export/quantizer_lib.py | 8 +--
3 files changed, 55 insertions(+), 60 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index f594c6fffa8..2ede04e53db 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -116,8 +116,14 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
+ subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+ dataset = None # Only Data Free Quantization is Supported in OVQuantizer
+ compression_format = nncf.CompressionFormat.DQ
+ nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
+ subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
+ )
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
- subset_size=None, **weight_compression_configuration
+ subset_size=subset_size, **weight_compression_configuration
)
def set_ignored_scope(
@@ -176,21 +182,20 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
- ignored_names = self._algo.get_ignored_node_names(nncf_graph)
+ all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
- for node in nodes_to_compress:
- is_target_node = should_consider_scope(node.node_name, ignored_names)
- if not is_target_node:
- continue
+ for wc_param in all_wc_params:
+ wc_config = wc_param.compression_config
+ node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, node.node_name
+ graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
- group_size = getattr(self._algo, "_group_size", -1)
+ group_size = wc_config.group_size
+ qmode = wc_config.mode
qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+ qp=None, group_size=group_size, qmode=qmode, weights_only=True
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269f927e9f6..00785491100 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -792,9 +792,9 @@ def get_quantizer_and_quant_params(llm_config):
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
- group_size = group_size if group_size else 32
+ group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
- llm_config.quantization.pt2e_quantize.value,
+ llm_config.quantization.pt2e_quantize.value, group_size
)
quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -921,59 +921,51 @@ def _to_edge_and_lower_llama_openvino(
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
-
+ try:
+ import nncf
+ from functools import partial
+ from pytorch_tokenizers import get_tokenizer
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+
+ tokenizer = get_tokenizer(builder_exported.tokenizer_path)
+ from datasets import load_dataset
# Use NNCF compression if enabled
# TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
if nncf_compression:
- try:
- from functools import partial
-
- import nncf
- from pytorch_tokenizers import get_tokenizer
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
- tokenizer = get_tokenizer(builder_exported.tokenizer_path)
-
- def transform_fn(prompts: str, tokenizer):
- tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
-
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+ dataset = dataset.filter(lambda example: example['text'].strip() != "")
+ dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
+ device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
inputs = ()
inputs = (
- torch.tensor(tokenized_text).unsqueeze(0),
- {"input_pos": torch.tensor([0])},
+ torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
+ {"input_pos": torch.tensor([0], device=device)},
)
return inputs
-
- builder_exported.calibration_data = (
- [builder_exported.calibration_data]
- if isinstance(builder_exported.calibration_data, str)
- else builder_exported.calibration_data
- )
- builder_exported.calibration_data = (
- [
- word
- for prompt in builder_exported.calibration_data
- for word in prompt.split()
- ]
- if not builder_exported.dynamic_shapes
- else builder_exported.calibration_data
- )
-
+
builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(
- builder_exported.calibration_data,
- transform_func=partial(transform_fn, tokenizer=tokenizer),
- ),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ group_size=32,
+ backup_mode=nncf.BackupMode.NONE,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(partitioners)
+
+ else:
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 2a20a90d55a..9220c1efbdc 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -221,7 +221,7 @@ def get_ov_quantizer(
):
try:
from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
-
+ import nncf
except ImportError:
raise ImportError(
"Please install nncf via backends/openvino/requirements.txt"
@@ -234,8 +234,7 @@ def get_ov_quantizer(
assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
# Manually ignore MP layers.
- fp_node_names = linear_list = [
- "embedding", # First embedding is kept in Full precision
+ fp_node_names = [
"linear_14",
"linear_15",
"linear_35",
@@ -262,8 +261,7 @@ def get_ov_quantizer(
"linear_105",
"linear_106",
"linear_109",
- "linear_110",
- "linear_112",]
+ "linear_110",]
if quant_config == "8da4w":
mode = QuantizationMode.INT4WO_SYM
From 9e65a7ef860e5725522859bbf8d863c76e26503d Mon Sep 17 00:00:00 2001
From: anzr299
Date: Wed, 27 Aug 2025 17:29:05 +0400
Subject: [PATCH 018/266] clean up and use new nncf method for obtaining
compression parameters
---
backends/openvino/quantizer/observers.py | 127 ++++++-----------------
backends/openvino/quantizer/quantizer.py | 52 ++++------
2 files changed, 48 insertions(+), 131 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 2ea66f11a55..845a091d24b 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -25,10 +25,7 @@
)
from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
- WeightCompressionConfig,
-)
-from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped]
- FXWeightCompressionAlgoBackend,
+ WeightCompressionParameters,
)
from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
@@ -45,19 +42,31 @@
INT8AsymmetricWeightsDecompressor,
INT8SymmetricWeightsDecompressor,
)
-from torchao.quantization.pt2e import MappingType, ObserverBase
-from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+from torchao.quantization.pt2e import ObserverBase
+
class WeightObserverBase(ObserverBase, ABC):
"""
Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
"""
+ def __init__(
+ self,
+ wc_param: WeightCompressionParameters,
+ dtype: torch.dtype,
+ **kwargs,
+ ) -> None:
+ """
+ :param wc_param: Weight compression parameter which contains information such as group_size
+ reduction_axes, quantization mode etc.
+ :param dtype: target dtype for quantization such as int8, uint8, etc.
+ """
+ super().__init__(dtype=dtype, is_dynamic=False)
+ self.wc_param = wc_param
+
def calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
- observer_node: torch.fx.Node,
- model: torch.fx.GraphModule,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
Calculate quantization parameters such as scale, quantized weight and zero point.
@@ -65,26 +74,11 @@ def calculate_qparams( # type: ignore[override]
:param weight: FP weight to be used for calculating qparams.
:return: quantization params quantized weight, scale and zero point
"""
- ndims = len(weight.size())
- node_with_weight, weight_port_id = (
- WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
- )
- _, node_metatype = GraphConverter.get_node_type_and_metatype(
- node_with_weight, model
- )
- # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
- node_metatype = (
- om.PTAtenEmbeddingMetatype
- if node_metatype == om.PTEmbeddingMetatype
- else node_metatype
- )
- reduction_dims = get_weight_compression_reduction_axes(
- node_metatype, weight_port_id, ndims
- )
- reduction_dims = tuple(reduction_dims)
-
+ wc_param = self.get_wc_param()
+ wc_config = wc_param.compression_config
+ reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ Tensor(weight), wc_config, reduction_axes=reduction_axes
)
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -92,23 +86,6 @@ def calculate_qparams( # type: ignore[override]
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x
- @staticmethod
- def get_node_with_weight_and_port_ids(
- observer_node: torch.fx.Node, model: torch.fx.GraphModule
- ) -> Tuple[torch.fx.Node, int]:
- """
- Returns the node which contains the weight and the weight port id.
-
- :param observer_node: Observer node for the weight.
- :param graph: The model.
- :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
- """
- for node in model.graph.nodes:
- if observer_node in node.all_input_nodes:
- return node, node.all_input_nodes.index(observer_node)
- msg = f"Observer node {observer_node.name} has no consumer node"
- raise RuntimeError(msg)
-
def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
@@ -126,7 +103,7 @@ def convert(
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
q_weight, scale, zero_point = self.calculate_qparams(
- original_weight, observer_node, model
+ original_weight
)
decompressor = self._create_decompressor(
@@ -134,6 +111,7 @@ def convert(
)
packed_q_weight = decompressor.pack_weight(q_weight)
+ # Weight port id is 0 since observer is inserted for a single weight only.
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
@@ -177,7 +155,7 @@ def _create_decompressor(
pass
@abstractmethod
- def get_wc_config(self) -> WeightCompressionConfig:
+ def get_wc_param(self) -> WeightCompressionParameters:
"""
Used to return the respective NNCF Weight Compression Config.
@@ -191,30 +169,6 @@ class INT4WeightObserver(WeightObserverBase):
This class defines the behavior for INT4 Weight Compression which has per-group granularity.
"""
- def __init__(
- self,
- group_size: int,
- mapping_type: MappingType,
- target_dtype: torch.dtype,
- *args,
- **kwargs,
- ) -> None:
- """
- :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
- :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
- :param target_dtype: target dtype for quantization such as int8, uint8, etc.
- """
- super().__init__(dtype=target_dtype, is_dynamic=False)
- self.wc_config = None
- self.mapping_type = mapping_type
-
- qmode = (
- CompressWeightsMode.INT4_ASYM
- if self.mapping_type == MappingType.ASYMMETRIC
- else CompressWeightsMode.INT4_SYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
-
def _create_decompressor(
self,
scale: torch.Tensor,
@@ -235,8 +189,8 @@ def _create_decompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
- def get_wc_config(self):
- return self.wc_config
+ def get_wc_param(self) -> WeightCompressionParameters:
+ return self.wc_param
class INT8WeightObserver(WeightObserverBase):
@@ -244,30 +198,6 @@ class INT8WeightObserver(WeightObserverBase):
This class defines the behavior for Int8 WC which has per channel granularity.
"""
- def __init__(
- self,
- qscheme: torch.qscheme,
- dtype: torch.dtype,
- ch_axis: int = 0,
- *args,
- **kwargs,
- ) -> None:
- """
- :param qscheme: Quantization scheme which is per-channel for Int8 WC.
- :param dtype: dtype for quantization such as int8, uint8, etc..
- :param ch_axis: Channel axis.
- """
- super().__init__(dtype=dtype, is_dynamic=False)
- self.wc_config = None
- self.qscheme = qscheme
-
- qmode = (
- CompressWeightsMode.INT8_SYM
- if self.qscheme == torch.per_channel_symmetric
- else CompressWeightsMode.INT8_ASYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode)
-
def _create_decompressor(
self,
scale: torch.Tensor,
@@ -282,5 +212,6 @@ def _create_decompressor(
else:
return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
- def get_wc_config(self):
- return self.wc_config
\ No newline at end of file
+ def get_wc_param(self) -> WeightCompressionParameters:
+ return self.wc_param
+
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2ede04e53db..ef9a83ca77c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -24,9 +24,11 @@
from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
get_weight_compression_configuration,
)
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
+ WeightCompressionParameters,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
- MappingType,
PerChannelMinMaxObserver,
UniformQuantizationObserverBase,
)
@@ -112,16 +114,11 @@ def __init__(
else:
weight_compression_configuration = get_weight_compression_configuration(
mode.value.replace(
- "_wc", ""
+ "wo", ""
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
- dataset = None # Only Data Free Quantization is Supported in OVQuantizer
- compression_format = nncf.CompressionFormat.DQ
- nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
- subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
- )
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
)
@@ -185,17 +182,14 @@ def _annotate_weight_compression(
all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
for wc_param in all_wc_params:
- wc_config = wc_param.compression_config
node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
- group_size = wc_config.group_size
- qmode = wc_config.mode
qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, group_size=group_size, qmode=qmode, weights_only=True
+ qp=None, wc_param=wc_param
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -425,19 +419,16 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
qp: quantization.quantizer_setup.QuantizationPointBase,
- group_size: int = -1,
- qmode: Optional[QuantizationMode] = None,
- weights_only: bool = False,
+ wc_param: WeightCompressionParameters = None,
) -> QuantizationSpec:
"""
Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
- For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
- and `weights_only`. For post-training quantization, only `qp` is required.
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries
+ weight only quantization info such as group_size, reduction_axes etc. For post-training
+ quantization, only `qp` is required.
:param qp: Quantization point from NNCF.
- :param group_size: Group size for INT4 group-wise quantization.
- :param qmode: Quantization mode for weight compression.
- :param weights_only: If True, applies weight-only quantization logic.
+ :param wc_param: NNCF Weight compression parameters for the node.
:return: A TorchAO QuantizationSpec.
"""
observer: Type[UniformQuantizationObserverBase]
@@ -445,26 +436,21 @@ def _get_torch_ao_qspec_from_nncf_config(
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args: Dict[str, Any] = {"eps": 1e-16}
- if weights_only:
- mapping_type = (
- MappingType.SYMMETRIC
- if qmode == QuantizationMode.INT4WO_SYM
- else MappingType.ASYMMETRIC
- )
- if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
- extra_args["group_size"] = group_size
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
+ if wc_param:
+ qmode = wc_param.compression_config.mode
+ if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+ extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
- quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+ quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
dtype = torch.int8
channel_axis = 0
torch_qscheme = None
else:
+ extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
- quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
+ quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
From 53e0f4cd0e01ed5a8adb85a7c08a2722d4a5a622 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 10:39:20 +0400
Subject: [PATCH 019/266] review changes & update method names according to wc
algo
---
backends/openvino/quantizer/observers.py | 4 ++--
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 845a091d24b..50fcc673ed6 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -30,7 +30,7 @@
from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
)
-from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.tensor.tensor import Tensor as NNCFTensor # type: ignore[import-untyped]
from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
PTTargetPoint,
TargetType,
@@ -78,7 +78,7 @@ def calculate_qparams( # type: ignore[override]
wc_config = wc_param.compression_config
reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), wc_config, reduction_axes=reduction_axes
+ NNCFTensor(weight), wc_config, reduction_axes=reduction_axes
)
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef9a83ca77c..2e364424b16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -179,7 +179,7 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
+ all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
for wc_param in all_wc_params:
node_with_weight = wc_param.node_with_weight
From bf959305dc210416f20c327509291db3655028e9 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 11:14:13 +0400
Subject: [PATCH 020/266] review changes
---
backends/openvino/quantizer/observers.py | 2 +-
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 50fcc673ed6..b1054460a16 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -166,7 +166,7 @@ def get_wc_param(self) -> WeightCompressionParameters:
class INT4WeightObserver(WeightObserverBase):
"""
- This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+ OpenVINO INT4 Weight Compression observer.
"""
def _create_decompressor(
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2e364424b16..485d67e3bb9 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -187,7 +187,7 @@ def _annotate_weight_compression(
graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
- edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+ edge_or_node = self._get_weight_edge(target_node, nncf_graph)
qspec = self._get_torch_ao_qspec_from_nncf_config(
qp=None, wc_param=wc_param
)
From 2d4bec7a4b0041ead027a6c651e00eee32343dc4 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 11:31:40 +0400
Subject: [PATCH 021/266] review changes
---
backends/openvino/quantizer/observers.py | 38 ++++++-----------------
backends/openvino/quantizer/quantizer.py | 7 +----
examples/models/llama/export_llama_lib.py | 2 +-
3 files changed, 12 insertions(+), 35 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index b1054460a16..d44a22556dd 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -9,12 +9,7 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
-import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped]
-
import torch
-from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped]
- GraphConverter,
-)
from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
get_tensor_constant_from_node,
@@ -23,7 +18,6 @@
constant_update_fn,
module_insertion_transformation_builder,
)
-from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
)
@@ -57,9 +51,8 @@ def __init__(
**kwargs,
) -> None:
"""
- :param wc_param: Weight compression parameter which contains information such as group_size
- reduction_axes, quantization mode etc.
- :param dtype: target dtype for quantization such as int8, uint8, etc.
+ :param wc_param: Weight compression parameters container.
+ :param dtype: target dtype for the quantization.
"""
super().__init__(dtype=dtype, is_dynamic=False)
self.wc_param = wc_param
@@ -69,10 +62,10 @@ def calculate_qparams( # type: ignore[override]
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
- Calculate quantization parameters such as scale, quantized weight and zero point.
+ Calculates quantization parameters: quantized weight, quantization scale and quantization zero point.
:param weight: FP weight to be used for calculating qparams.
- :return: quantization params quantized weight, scale and zero point
+ :return: A tuple containing the quantized weight, quantization scale and quantization zero point.
"""
wc_param = self.get_wc_param()
wc_config = wc_param.compression_config
@@ -90,10 +83,8 @@ def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
"""
- Converts the weight observer node into a decompression subgraph after calibration.
- This method is responsible for transforming the model after the quantization preparation
- and calibration phases. It replaces the observer node with the quantized weight and a decompression
- module.
+ Replaces the given observer node from the given model with a quantized
+ weight and a OpenVINO specific decompression module.
:param model: A `torch.fx.GraphModule` representing the statically traced model
with observer nodes attached and calibrated.
@@ -144,7 +135,7 @@ def _create_decompressor(
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
"""
- Used to return the respective NNCF decompressor for different types of quantization.
+ Returns a respective NNCF decompressor for different types of quantization.
:param scale: Calculated scale quantization parameter.
:param zero_point: Calculated zero_point quantization parameter.
@@ -152,17 +143,14 @@ def _create_decompressor(
:param original_weight: FP weight.
:return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
"""
- pass
- @abstractmethod
def get_wc_param(self) -> WeightCompressionParameters:
"""
- Used to return the respective NNCF Weight Compression Config.
+ Returns a respective NNCF Weight Compression Config.
:return: Weight compression config with the compression information such as qmode, group_size etc.
"""
- pass
-
+ return self.wc_param
class INT4WeightObserver(WeightObserverBase):
"""
@@ -189,13 +177,10 @@ def _create_decompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
- def get_wc_param(self) -> WeightCompressionParameters:
- return self.wc_param
-
class INT8WeightObserver(WeightObserverBase):
"""
- This class defines the behavior for Int8 WC which has per channel granularity.
+ OpenVINO INT8 Weight Compression per channel observer.
"""
def _create_decompressor(
@@ -212,6 +197,3 @@ def _create_decompressor(
else:
return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
- def get_wc_param(self) -> WeightCompressionParameters:
- return self.wc_param
-
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 485d67e3bb9..7f86686d03c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -205,15 +205,10 @@ def _annotate_post_training_quantization(
"""
Annotates the model graph with post-training quantization configurations.
- Converts NNCF quantization points into TorchAO-compatible quantization specs,
- assigning them to corresponding nodes or edges. Also handles unified scale groups,
- ensuring shared quantization specs across grouped quantizers with consistent configs.
-
:param model: The FX GraphModule to annotate.
:param graph: The underlying FX graph.
:param nncf_graph: The corresponding NNCF graph.
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
:return: Updated mapping of FX nodes with post-training quantization annotations.
"""
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -575,4 +570,4 @@ def quantize_model(
smooth_quant=smooth_quant,
**kwargs,
)
- return quantized_model
\ No newline at end of file
+ return quantized_model
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 00785491100..269022f2cf7 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
)
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
- assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ assert quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
From 0a2e361f04aa724c8af7d88c1dbd286b4c7556d6 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 3 Sep 2025 20:48:10 +0400
Subject: [PATCH 022/266] Update export_llama_lib.py
---
examples/models/llama/export_llama_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269022f2cf7..8eab3eefbc0 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
)
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
- assert quantizers, "Should not enable both xnnpack and openvino"
+ assert not quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
From 4c86a9c91d6eeec8eca53ea66d4f5132cd007a6d Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 3 Sep 2025 13:32:08 -0700
Subject: [PATCH 023/266] enable group_size parameter for nncf compression
---
backends/openvino/requirements.txt | 2 +-
examples/models/llama/export_llama_lib.py | 3 +++
extension/llm/export/config/llm_config.py | 5 ++++-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 316633e9004..2ada445414c 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
transformers
-git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 47527a326f9..417d25550ab 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -898,6 +898,7 @@ def _to_edge_and_lower_llama_openvino(
additional_passes,
openvino_device: str = "CPU",
nncf_compression: bool = False,
+ nncf_compression_group_size: int = 32,
verbose: bool = False,
) -> LLMEdgeManager: # noqa: C901
partitioners = []
@@ -959,6 +960,7 @@ def transform_fn(prompts: str, tokenizer):
),
mode=nncf.CompressWeightsMode.INT4_SYM,
ratio=0.8,
+ group_size=nncf_compression_group_size,
sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
)
@@ -1208,6 +1210,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
additional_passes,
openvino_device=llm_config.backend.openvino.device,
nncf_compression=llm_config.backend.openvino.nncf_compression,
+ nncf_compression_group_size=llm_config.backend.openvino.nncf_compression_group_size,
verbose=llm_config.debug.verbose,
)
else:
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index ab18c19159b..c8f15bc1f9a 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -456,7 +456,8 @@ class OpenvinoConfig:
enabled: bool = False
device: str = "CPU"
- nncf_compression = False
+ nncf_compression: bool = False
+ nncf_compression_group_size: int = 32
@dataclass
@@ -645,6 +646,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.backend.openvino.device = args.openvino_device
if hasattr(args, "nncf_compression"):
llm_config.backend.openvino.nncf_compression = args.nncf_compression
+ if hasattr(args, "group_size") and args.group_size:
+ llm_config.backend.openvino.nncf_compression_group_size = args.group_size
# DebugConfig
if hasattr(args, "profile_memory"):
From 46ed3f6d5ca71439c13c781eea1156bd4383ad3c Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 15:09:13 -0700
Subject: [PATCH 024/266] Update README.md
---
backends/openvino/README.md | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index a67cf12eca2..73b6bd9b20a 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -42,11 +42,23 @@ executorch
Before you begin, ensure you have openvino installed and configured on your system.
-### Build OpenVINO from Source
+### Use OpenVINO from Release Packages
+
+1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
+
+2. Extract the release package from the archive and set the environment variables.
+
+ ```bash
+ tar -zxf openvino_toolkit_.tgz
+ cd openvino_toolkit_
+ source setupvars.sh
+ ```
+
+### (Optional) Build OpenVINO from Source
```bash
git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout b16b776ac119dafda51f69a80f1e6b7376d02c3b
+cd openvino
git submodule update --init --recursive
sudo ./install_build_dependencies.sh
mkdir build && cd build
@@ -59,18 +71,6 @@ cd
source setupvars.sh
```
-### Use OpenVINO from Release Packages
-
-1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
-
-2. Extract the release package from the archive and set the environment variables.
-
- ```bash
- tar -zxf openvino_toolkit_.tgz
- cd openvino_toolkit_
- source setupvars.sh
- ```
-
For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md).
### Setup
From 0a1256eb351a5562e593f82ed921da2eeb9b245f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 15:26:08 -0700
Subject: [PATCH 025/266] Update README.md
---
backends/openvino/README.md | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index 73b6bd9b20a..ce10b902646 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -77,17 +77,27 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct
Follow the steps below to setup your build environment:
-1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment.
-2. **Setup OpenVINO Backend Environment**
+1. **Create a Virtual Environment**
+- Create a virtual environment and activate it by executing the commands below.
+ ```bash
+ python -m venv env
+ source env/bin/activate
+ ```
+2. **Clone ExecuTorch Repository from Github**
+- Clone Executorch repository by executing the command below.
+ ```bash
+ git clone --recurse-submodules https://github.com/pytorch/executorch.git
+ ```
+3. **Setup OpenVINO Backend Environment**
- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
```bash
pip install -r requirements.txt
```
Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher).
-3. Navigate to `scripts/` directory.
+4. Navigate to `scripts/` directory.
-4. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
+5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
```bash
./openvino_build.sh
@@ -97,6 +107,7 @@ Follow the steps below to setup your build environment:
```bash
./openvino_build.sh --enable_python
```
+For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide.
### Run
From f2151e3baddd32003f5d0e5bb36e34830207a76c Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 17:25:15 -0700
Subject: [PATCH 026/266] Update README.md
---
backends/openvino/README.md | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index ce10b902646..cc5b20cbab8 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -89,24 +89,25 @@ Follow the steps below to setup your build environment:
```bash
git clone --recurse-submodules https://github.com/pytorch/executorch.git
```
-3. **Setup OpenVINO Backend Environment**
-- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
+3. **Build ExecuTorch with OpenVINO Backend**
+- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing.
```bash
- pip install -r requirements.txt
- ```
- Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher).
-4. Navigate to `scripts/` directory.
-
-5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
-
- ```bash
- ./openvino_build.sh
+ openvino_build.sh
```
+- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
**Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
-
- ```bash
+ ```bash
./openvino_build.sh --enable_python
```
+ **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models.
+ ```bash
+ ./openvino_build.sh --cpp_runtime
+ ```
+ **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`.
+ ```bash
+ ./openvino_build.sh --llama_runner
+ ```
+
For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide.
### Run
From dfc8eab6d862a9be10e95fd6ae82e122c9869574 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 3 Sep 2025 17:55:26 -0700
Subject: [PATCH 027/266] openvino backend build script updates
---
backends/openvino/scripts/openvino_build.sh | 155 ++++++++++++--------
1 file changed, 91 insertions(+), 64 deletions(-)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index 08741840ddb..b7e5f5270ab 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -7,79 +7,106 @@ set -e
EXECUTORCH_ROOT=$(realpath "$(dirname "$0")/../../..")
echo EXECUTORCH_ROOT=${EXECUTORCH_ROOT}
-main() {
- build_type=${1:-"--cpp_runtime"}
-
- # If the first arguments is --cpp_runtime (default), build libraries for C++ runtime
- if [[ -z "$build_type" || "$build_type" == "--cpp_runtime" ]]; then
- echo "Building C++ Runtime Libraries"
-
- # Set build directory
- local build_dir="cmake-out"
-
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
- rm -rf "${build_dir}"
-
- # Configure the project with CMake
- # Note: Add any additional configuration options you need here
- cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
- -DCMAKE_BUILD_TYPE=Release \
- -DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
- -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
- -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
- -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
- -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
- -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
- -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
- -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
- -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
- -B"${build_dir}"
-
-
- # Build the project
- cmake --build ${build_dir} --target install --config Release -j$(nproc)
+install_requirements() {
+ echo "Installing Requirements For OpenVINO Backend"
+ cd "$EXECUTORCH_ROOT"
+ pip install -r backends/openvino/requirements.txt
+}
- # If the first arguments is --enable_python, build python package with python bindings
- elif [[ "$build_type" == "--enable_python" ]]; then
- echo "Building Python Package with Pybinding"
+build_cpp_runtime() {
+ echo "Building C++ Runtime Libraries"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+ rm -rf "${build_dir}"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+ -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+ -B"${build_dir}"
+
+
+ # Build the project
+ cmake --build ${build_dir} --target install --config Release -j$(nproc)
+}
+
+build_llama_runner() {
+ echo "Building Export Llama Runner"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -B"${build_dir}"/examples/models/llama \
+ examples/models/llama
+ # Build the export llama runner
+ cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
+}
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
- ./install_executorch.sh --clean
+build_python_enabled() {
+ echo "Building Python Package with Pybinding"
- # Set parameters to configure the project with CMake
- # Note: Add any additional configuration options you need here
- export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
- export CMAKE_BUILD_ARGS="--target openvino_backend"
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+ ./install_executorch.sh --clean
- # Build the package
- ./install_executorch.sh --minimal
+ # Set parameters to configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
+ export CMAKE_BUILD_ARGS="--target openvino_backend"
- # Install torchao
- pip install third-party/ao
+ # Build the package
+ ./install_executorch.sh --minimal
+
+ # Install torchao
+ pip install third-party/ao
+}
+
+main() {
+ build_type=${1:-"--build_all"}
+
+ # If the first arguments is --build_all (default), build python package, C++ runtime, and llama runner binary
+ if [[ -z "$build_type" || "$build_type" == "--build_all" ]]; then
+ install_requirements
+ build_python_enabled
+ build_cpp_runtime
+ build_llama_runner
+
+ # If the first arguments is --cpp_runtime, build libraries for C++ runtime
+ elif [[ "$build_type" == "--cpp_runtime" ]]; then
+ build_cpp_runtime
# If the first arguments is --llama_runner, build export llama runner binary
# Note: c++ runtime with openvino backend should be built before building export llama runner
elif [[ "$build_type" == "--llama_runner" ]]; then
- echo "Building Export Llama Runner"
-
- # Set build directory
- local build_dir="cmake-out"
-
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
-
- # Configure the project with CMake
- # Note: Add any additional configuration options you need here
- cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
- -DCMAKE_BUILD_TYPE=Release \
- -B"${build_dir}"/examples/models/llama \
- examples/models/llama
- # Build the export llama runner
- cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
+ build_llama_runner
+
+ # If the first arguments is --enable_python, build python package with python bindings
+ elif [[ "$build_type" == "--enable_python" ]]; then
+ install_requirements
+ build_python_enabled
+
else
echo "Error: Argument is not valid: $build_type"
exit 1 # Exit the script with an error code
From 2ac8a8c0b7ea3f2e0b391b1b7cba9460b71dad86 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 4 Sep 2025 15:41:46 -0700
Subject: [PATCH 028/266] Update README.md
---
backends/openvino/README.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index cc5b20cbab8..71bd27f6b50 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -18,6 +18,11 @@ For more information on the supported hardware, please refer to [OpenVINO System
executorch
├── backends
│ └── openvino
+│ ├── quantizer
+│ ├── observers
+│ └── nncf_observers.py
+│ ├── __init__.py
+│ └── quantizer.py
│ ├── runtime
│ ├── OpenvinoBackend.cpp
│ └── OpenvinoBackend.h
@@ -95,6 +100,7 @@ Follow the steps below to setup your build environment:
openvino_build.sh
```
- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
+
**Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
```bash
./openvino_build.sh --enable_python
From 35444aefa26b92b802305669fcef5a7ee857a654 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 4 Sep 2025 15:59:36 -0700
Subject: [PATCH 029/266] Update README.md
---
backends/openvino/README.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index 71bd27f6b50..0046ad23486 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -95,21 +95,21 @@ Follow the steps below to setup your build environment:
git clone --recurse-submodules https://github.com/pytorch/executorch.git
```
3. **Build ExecuTorch with OpenVINO Backend**
-- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing.
+- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime libraries and binaries into `/cmake-out` for quick inference testing.
```bash
openvino_build.sh
```
-- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
+- Optionally, `openvino_build.sh` script can be used to build python package or C++ libraries/binaries separately.
- **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
+ **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument as shown in the below command. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
```bash
./openvino_build.sh --enable_python
```
- **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models.
+ **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models.
```bash
./openvino_build.sh --cpp_runtime
```
- **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`.
+ **Build C++ Llama Runner**: First, ensure the C++ runtime libraries are built by following the earlier instructions. Then, run the `openvino_build.sh` script with the `--llama_runner` flag to compile the Llama runner as shown in the below command, which enables executing inference with models exported using `export_llama`. The resulting binary is located at: `/cmake-out/examples/models/llama/llama_main`
```bash
./openvino_build.sh --llama_runner
```
From 5b8b633a94ca13b672db873e07725363c2e2014c Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:18:03 -0700
Subject: [PATCH 030/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index a2920285f99..10d4b2b30a7 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,7 +27,7 @@
class PatternNode:
- op_types = {}
+ op_types: dict[str, list] = {}
def __init__(self):
self.op_types = {}
From f4a1423ddc5517495b0993d7d183450e4605f702 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:33:16 -0700
Subject: [PATCH 031/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 10d4b2b30a7..4893a89bebb 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -114,7 +114,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
- self._enabled_ops_by_name = set()
+ self._enabled_ops_by_name: set = set()
def ops_to_not_decompose(
self,
From 44f08831df4d4707b1fba855299293ab435815f6 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:39:03 -0700
Subject: [PATCH 032/266] formatting fix
---
backends/openvino/partitioner.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 4893a89bebb..1d93ebd9cec 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -141,10 +141,10 @@ def check_pattern(
self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
) -> bool:
if node.op == "call_function":
- if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
+ if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr]
pt_input_nodes = node.all_input_nodes
pattern_input_ops = pattern.op_types[
- "call_function" + ":" + str(node.target.__name__)
+ "call_function" + ":" + str(node.target.__name__) # type: ignore[union-attr]
]
if pattern_input_ops is None:
enabled_ops.append(node)
From 5f657d3ce8cdc34edf2c3129b274c02917a30231 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:02:55 -0700
Subject: [PATCH 033/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 1d93ebd9cec..d4aff6fa7d3 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,7 +27,7 @@
class PatternNode:
- op_types: dict[str, list] = {}
+ op_types: dict[str, Optional[list]] = {}
def __init__(self):
self.op_types = {}
From eafcc33ab6bf99b0bfe8155f324af3e961cba279 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:06:29 -0700
Subject: [PATCH 034/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index d4aff6fa7d3..5ed9508ca89 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -193,7 +193,7 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
str(node.op) == "call_function"
and str(node.target.__name__) == "aten.stack.default"
):
- enabled_ops = []
+ enabled_ops: list = []
pattern_match = self.check_pattern(node, stack_node, enabled_ops)
if pattern_match:
for pattern_op in enabled_ops:
From 1763b99d7c7785a1b2f5c3152601924f97c07fea Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:14:59 -0700
Subject: [PATCH 035/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 5ed9508ca89..20841d6730b 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -138,7 +138,7 @@ def ops_to_not_decompose(
return (ops_not_decompose, None)
def check_pattern(
- self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
+ self, node: torch.fx.Node, pattern: type[PatternNode], enabled_ops: list
) -> bool:
if node.op == "call_function":
if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr]
From 486382636b43a348512a934110f3215bbc67e842 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:23:35 -0700
Subject: [PATCH 036/266] formatting fix
---
backends/openvino/quantizer/observers/nncf_observers.py | 4 ++--
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index f6ac2a3cb91..ac95b1bbef5 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -111,7 +111,7 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
model.graph.erase_node(observer_node)
@@ -172,5 +172,5 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index cd78f6907c7..84e29239419 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -75,7 +75,7 @@ class OpenVINOQuantizer(Quantizer):
def __init__(
self,
*,
- mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+ mode: QuantizationMode = QuantizationMode.INT8_SYM,
**kwargs,
):
"""
From e24072fc68c7884b62a437de3d8d2b7f60cd9efe Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:43:00 -0700
Subject: [PATCH 037/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 84e29239419..5cbd50c3136 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -208,7 +208,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( # type: ignore[no-redef]
qp
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -412,9 +412,9 @@ def _get_torch_ao_qspec_from_nncf_config(
else MappingType.ASYMMETRIC
)
if qconfig.num_bits == 4:
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
- extra_args["granularity"] = PerGroup(group_size=group_size)
+ extra_args["mapping_type"] = mapping_type # type: ignore[assignment]
+ extra_args["target_dtype"] = torch.int8 # type: ignore[assignment]
+ extra_args["granularity"] = PerGroup(group_size=group_size) # type: ignore[assignment]
observer = PTPerBlockParamObserver
quant_min = -8
quant_max = 7
From b9bb5f08224544f9f4e9a6896bf756fc41462ce3 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:51:16 -0700
Subject: [PATCH 038/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 5cbd50c3136..aef9e56876b 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -391,6 +391,10 @@ def _get_torch_ao_qspec_from_nncf_config(
extra_args = {"eps": 1e-16}
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
+ dtype = None
+ quant_min = None
+ quant_max = None
+ channel_axis = None
observer: Type[UniformQuantizationObserverBase]
if qconfig.per_channel:
From 291dcd993e17136a3609e30919aa4d406ed54113 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:56:31 -0700
Subject: [PATCH 039/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index aef9e56876b..f2011431a03 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -391,7 +391,7 @@ def _get_torch_ao_qspec_from_nncf_config(
extra_args = {"eps": 1e-16}
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
- dtype = None
+ dtype = torch.int8
quant_min = None
quant_max = None
channel_axis = None
From c8ea777098b8a812e6162b767dbfeabdd7c193c4 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:39:52 +0400
Subject: [PATCH 040/266] use new transformations
---
backends/openvino/quantizer/observers.py | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index d44a22556dd..76ab33eb5c5 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -15,8 +15,9 @@
get_tensor_constant_from_node,
)
from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
- constant_update_fn,
- module_insertion_transformation_builder,
+ constant_update,
+ module_insertion,
+ node_removal,
)
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
@@ -103,7 +104,7 @@ def convert(
packed_q_weight = decompressor.pack_weight(q_weight)
# Weight port id is 0 since observer is inserted for a single weight only.
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ constant_update(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
decompressor_suffix = "_".join(
@@ -111,7 +112,8 @@ def convert(
)
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
- module_insertion_transformation_builder(
+ module_insertion(
+ model,
decompressor,
[
PTTargetPoint(
@@ -120,11 +122,8 @@ def convert(
)
],
decompressor_name,
- )(model)
-
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
- model.graph.erase_node(observer_node)
+ )
+ node_removal(model, observer_node, 0)
@abstractmethod
def _create_decompressor(
From a6b605f41b5390ff9de70b2397a2d00003f34ff2 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:46:24 +0400
Subject: [PATCH 041/266] add comment for manual MP allocation
---
extension/llm/export/quantizer_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 9220c1efbdc..e839827208c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -233,7 +233,7 @@ def get_ov_quantizer(
), f"The quantization config is for backend {backend} instead of openvino."
assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # Manually ignore MP layers.
+ # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
"linear_14",
"linear_15",
From 9614fc4da170d76a39e047d0c364177bf96d0209 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:48:58 +0400
Subject: [PATCH 042/266] remove nncf_compression from export llama lib
---
examples/models/llama/export_llama_lib.py | 54 +----------------------
1 file changed, 1 insertion(+), 53 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 8eab3eefbc0..ac52893b99c 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -567,13 +567,6 @@ def build_args_parser() -> argparse.ArgumentParser:
help="path to the input pruning token mapping file (token_map.json)",
)
- parser.add_argument(
- "--nncf_compression",
- default=False,
- action="store_true",
- help="Enables nncf compression for openvino backend",
- )
-
parser.add_argument(
"--export_only",
default=False,
@@ -909,7 +902,6 @@ def _to_edge_and_lower_llama_openvino(
quantizers,
additional_passes,
openvino_device: str = "CPU",
- nncf_compression: bool = False,
verbose: bool = False,
) -> LLMEdgeManager: # noqa: C901
partitioners = []
@@ -921,51 +913,8 @@ def _to_edge_and_lower_llama_openvino(
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
- try:
- import nncf
- from functools import partial
- from pytorch_tokenizers import get_tokenizer
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
-
- tokenizer = get_tokenizer(builder_exported.tokenizer_path)
- from datasets import load_dataset
- # Use NNCF compression if enabled
- # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
- if nncf_compression:
- dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
- dataset = dataset.filter(lambda example: example['text'].strip() != "")
- dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
- def transform_fn(
- prompts: str, tokenizer
- ):
- tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
- device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
- inputs = ()
- inputs = (
- torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
- {"input_pos": torch.tensor([0], device=device)},
- )
-
- return inputs
-
- builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- group_size=32,
- backup_mode=nncf.BackupMode.NONE,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
-
- builder = builder_exported.to_edge_transform_and_lower(partitioners)
-
- else:
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1211,7 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
quantizers,
additional_passes,
openvino_device=llm_config.backend.openvino.device,
- nncf_compression=llm_config.backend.openvino.nncf_compression,
verbose=llm_config.debug.verbose,
)
else:
From 45007cf90c054ccfd527874ae35d383fc34a4ee8 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:52:58 +0400
Subject: [PATCH 043/266] change pt2e quantize flag to use openvino_4wo instead
of openvino_8da4w and so on
---
extension/llm/export/config/llm_config.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index b4175d54cd7..49855d61e6e 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,8 +275,8 @@ class Pt2eQuantize(str, Enum):
xnnpack_dynamic = "xnnpack_dynamic"
xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
- openvino_8da4w = "openvino_8da4w"
- openvino_8da8w = "openvino_8da8w"
+ openvino_4wo = "openvino_4wo"
+ openvino_8wo = "openvino_8wo"
qnn_8a8w = "qnn_8a8w"
qnn_16a16w = "qnn_16a16w"
qnn_16a4w = "qnn_16a4w"
From 9d494147457e6696f7149e4b7cb69f95811cbd47 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:53:14 +0400
Subject: [PATCH 044/266] follow up to last commit
---
examples/models/llama/export_llama_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ac52893b99c..ec03f4b26c9 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -206,8 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
choices=[
"xnnpack_dynamic",
"xnnpack_dynamic_qc4",
- "openvino_8da4w",
- "openvino_8da8w",
+ "openvino_4wo",
+ "openvino_8wo",
"qnn_8a8w",
"qnn_16a16w",
"qnn_16a4w",
From d6727cfed609d07281fdea42358d2e234ac82f19 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:56:47 +0400
Subject: [PATCH 045/266] update quantizer lib with openvino_4wo
---
extension/llm/export/quantizer_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index e839827208c..8a097f9b8f1 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -263,10 +263,10 @@ def get_ov_quantizer(
"linear_109",
"linear_110",]
- if quant_config == "8da4w":
+ if quant_config == "4wo":
mode = QuantizationMode.INT4WO_SYM
- elif quant_config == "8da8w":
+ elif quant_config == "8wo":
group_size = -1
mode = QuantizationMode.INT8WO_SYM
else:
From 4a0a7819ab69aa0d8fdfce70f3be219c14abc409 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 14:06:48 +0400
Subject: [PATCH 046/266] split qspec function into 2 parts; 1 for WC and other
for PTQ qspecs
---
backends/openvino/quantizer/quantizer.py | 92 +++++++++++++-----------
1 file changed, 50 insertions(+), 42 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7f86686d03c..ef04ed0de46 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -188,8 +188,8 @@ def _annotate_weight_compression(
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = self._get_weight_edge(target_node, nncf_graph)
- qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, wc_param=wc_param
+ qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
+ wc_param=wc_param
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -217,7 +217,7 @@ def _annotate_post_training_quantization(
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -412,18 +412,58 @@ def _fill_torch_ao_annotation(
annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
@staticmethod
- def _get_torch_ao_qspec_from_nncf_config(
+ def _get_torch_ao_qspec_from_nncf_config_for_wc(
+ wc_param: WeightCompressionParameters,
+ ) -> QuantizationSpec:
+ """
+ Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter.
+
+ :param wc_param: NNCF Weight compression parameters for the node.
+ :return: A TorchAO QuantizationSpec.
+ """
+ observer: Type[UniformQuantizationObserverBase]
+
+ extra_args: Dict[str, Any] = {}
+
+ qmode = wc_param.compression_config.mode
+ if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+ extra_args["wc_param"] = wc_param
+ observer = INT4WeightObserver
+ quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = None
+ else:
+ extra_args["wc_param"] = wc_param
+ observer = INT8WeightObserver
+ quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qmode == QuantizationMode.INT8WO_SYM
+ else torch.per_channel_affine
+ )
+ return QuantizationSpec(
+ dtype=dtype,
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ quant_min=quant_min,
+ quant_max=quant_max,
+ qscheme=torch_qscheme,
+ ch_axis=channel_axis,
+ is_dynamic=False,
+ )
+
+ @staticmethod
+ def _get_torch_ao_qspec_from_nncf_config_for_ptq(
qp: quantization.quantizer_setup.QuantizationPointBase,
- wc_param: WeightCompressionParameters = None,
) -> QuantizationSpec:
"""
- Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
- For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries
- weight only quantization info such as group_size, reduction_axes etc. For post-training
- quantization, only `qp` is required.
+ Returns a TorchAO QuantizationSpec based on NNCF quantization point.
:param qp: Quantization point from NNCF.
- :param wc_param: NNCF Weight compression parameters for the node.
:return: A TorchAO QuantizationSpec.
"""
observer: Type[UniformQuantizationObserverBase]
@@ -431,38 +471,6 @@ def _get_torch_ao_qspec_from_nncf_config(
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args: Dict[str, Any] = {"eps": 1e-16}
- if wc_param:
- qmode = wc_param.compression_config.mode
- if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- extra_args["wc_param"] = wc_param
- observer = INT4WeightObserver
- quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = None
- else:
- extra_args["wc_param"] = wc_param
- observer = INT8WeightObserver
- quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = (
- torch.per_channel_symmetric
- if qmode == QuantizationMode.INT8WO_SYM
- else torch.per_channel_affine
- )
- return QuantizationSpec(
- dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(**extra_args),
- quant_min=quant_min,
- quant_max=quant_max,
- qscheme=torch_qscheme,
- ch_axis=channel_axis,
- is_dynamic=False,
- )
-
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
From f6a1ee3d708ca46fe495f081bc45872042b1bed6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 12:14:34 +0400
Subject: [PATCH 047/266] micro fix
---
backends/openvino/quantizer/quantizer.py | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef04ed0de46..762ed2a9171 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -426,24 +426,29 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
extra_args: Dict[str, Any] = {}
qmode = wc_param.compression_config.mode
+ is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
- quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+ quant_min = -8 if not is_asym_mode else 0
+ quant_max = 7 if not is_asym_mode else 15
dtype = torch.int8
channel_axis = 0
- torch_qscheme = None
+            torch_qscheme = (
+ torch.per_channel_symmetric
+ if not is_asym_mode
+ else torch.per_channel_affine
+ )
else:
extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
- quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+ quant_min = -128 if not is_asym_mode else 0
+ quant_max = 127 if not is_asym_mode else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
torch.per_channel_symmetric
- if qmode == QuantizationMode.INT8WO_SYM
+ if not is_asym_mode
else torch.per_channel_affine
)
return QuantizationSpec(
From d285fcce354f8bde55e968892932cbe4a34421cd Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 15:35:49 +0400
Subject: [PATCH 048/266] Update mixed precision layers for higher accuracy.
Change INT4 mode to Asymmetric
---
extension/llm/export/quantizer_lib.py | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 8a097f9b8f1..46b10dcb960 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,21 +235,17 @@ def get_ov_quantizer(
# (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
+ "linear_13",
"linear_14",
- "linear_15",
"linear_35",
"linear_56",
- "linear_57",
- "linear_63",
"linear_70",
"linear_71",
"linear_77",
"linear_78",
- "linear_81",
"linear_84",
"linear_85",
"linear_88",
- "linear_89",
"linear_91",
"linear_92",
"linear_95",
@@ -261,10 +257,11 @@ def get_ov_quantizer(
"linear_105",
"linear_106",
"linear_109",
- "linear_110",]
+ "linear_110",
+ "linear_111",]
if quant_config == "4wo":
- mode = QuantizationMode.INT4WO_SYM
+ mode = QuantizationMode.INT4WO_ASYM
elif quant_config == "8wo":
group_size = -1
From 4e66df1a52e40e90178f4c9fce815d364c5282f9 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Mon, 8 Sep 2025 18:12:37 +0400
Subject: [PATCH 049/266] Apply suggestions from code review
Co-authored-by: Daniil Lyakhov
---
backends/openvino/quantizer/observers.py | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 76ab33eb5c5..59a40f2be2d 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -56,9 +56,9 @@ def __init__(
:param dtype: target dtype for the quantization.
"""
super().__init__(dtype=dtype, is_dynamic=False)
- self.wc_param = wc_param
+ self._wc_param = wc_param
- def calculate_qparams( # type: ignore[override]
+ def _calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -68,7 +68,7 @@ def calculate_qparams( # type: ignore[override]
:param weight: FP weight to be used for calculating qparams.
:return: A tuple containing the quantized weight, quantization scale and quantization zero point.
"""
- wc_param = self.get_wc_param()
+ wc_param = self._wc_param
wc_config = wc_param.compression_config
reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
@@ -143,13 +143,6 @@ def _create_decompressor(
:return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
"""
- def get_wc_param(self) -> WeightCompressionParameters:
- """
- Returns a respective NNCF Weight Compression Config.
-
- :return: Weight compression config with the compression information such as qmode, group_size etc.
- """
- return self.wc_param
class INT4WeightObserver(WeightObserverBase):
"""
From e850e419cb313e86fd0f5669e7eaa1d115fcc10c Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:13:28 +0400
Subject: [PATCH 050/266] Review changes
---
backends/openvino/quantizer/observers.py | 30 ++++++++++++------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 59a40f2be2d..457399117e0 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -94,7 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(
+ q_weight, scale, zero_point = self._calculate_qparams(
original_weight
)
@@ -156,18 +156,17 @@ def _create_decompressor(
q_weight: torch.Tensor,
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
- if zero_point is not None:
- return INT4AsymmetricWeightsDecompressor(
- scale,
- zero_point,
- q_weight.shape,
- original_weight.shape,
- original_weight.dtype,
- )
- else:
+ if zero_point is None:
return INT4SymmetricWeightsDecompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
+ return INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
class INT8WeightObserver(WeightObserverBase):
@@ -182,10 +181,11 @@ def _create_decompressor(
q_weight: torch.Tensor,
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
- if zero_point is not None:
- return INT8AsymmetricWeightsDecompressor(
- scale, zero_point, original_weight.dtype
+ if zero_point is None:
+ return INT8SymmetricWeightsDecompressor(
+ scale, original_weight.dtype
)
- else:
- return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ return INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
From 204043f973ba928c3f2b73dc11e1db6572b7c4a7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:33:16 +0400
Subject: [PATCH 051/266] review changes in quantizer
---
backends/openvino/quantizer/quantizer.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 762ed2a9171..7e0e3c92af0 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -175,7 +175,6 @@ def _annotate_weight_compression(
:param graph: The underlying FX graph.
:param nncf_graph: The corresponding NNCF graph.
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
@@ -343,7 +342,7 @@ def _get_edge_or_node_and_annotation(
def _get_weight_edge(
target_node: torch.fx.Node,
nncf_graph: NNCFGraph,
- ):
+ ) -> tuple[torch.fx.Node, torch.fx.Node]:
"""
Returns the FX node corresponding to the weight tensor input of a given operator node.
Uses the NNCF graph to identify which input port of the target node holds the weight.
@@ -351,7 +350,6 @@ def _get_weight_edge(
:param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
:param nncf_graph: NNCFGraph used to determine weight port indices.
-
:return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
"""
nncf_node = nncf_graph.get_node_by_name(target_node.name)
@@ -428,7 +426,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
qmode = wc_param.compression_config.mode
is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
quant_min = -8 if not is_asym_mode else 0
quant_max = 7 if not is_asym_mode else 15
@@ -440,7 +437,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
else torch.per_channel_affine
)
else:
- extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
quant_min = -128 if not is_asym_mode else 0
quant_max = 127 if not is_asym_mode else 255
@@ -453,7 +449,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
)
return QuantizationSpec(
dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
quant_min=quant_min,
quant_max=quant_max,
qscheme=torch_qscheme,
From ae6b089f293d20248df4c3d8a0d0c5ddfed62c4c Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:45:54 +0400
Subject: [PATCH 052/266] revert extra args changes
---
backends/openvino/quantizer/quantizer.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7e0e3c92af0..89d528f8d16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -424,6 +424,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
extra_args: Dict[str, Any] = {}
qmode = wc_param.compression_config.mode
+ extra_args["wc_param"] = wc_param
is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
observer = INT4WeightObserver
@@ -449,7 +450,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
)
return QuantizationSpec(
dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
quant_min=quant_min,
quant_max=quant_max,
qscheme=torch_qscheme,
From 2de569398917362b9ffc02849037528c2a15efa7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Sep 2025 11:43:00 +0400
Subject: [PATCH 053/266] precommit fixes
---
backends/openvino/quantizer/observers.py | 11 +++------
backends/openvino/quantizer/quantizer.py | 30 +++++++++++++----------
examples/models/llama/export_llama_lib.py | 6 +++--
extension/llm/export/quantizer_lib.py | 21 +++++++++-------
4 files changed, 36 insertions(+), 32 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 457399117e0..faeb4fa7a60 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -84,7 +84,7 @@ def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
"""
- Replaces the given observer node from the given model with a quantized
+ Replaces the given observer node from the given model with a quantized
weight and a OpenVINO specific decompression module.
:param model: A `torch.fx.GraphModule` representing the statically traced model
@@ -94,9 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self._calculate_qparams(
- original_weight
- )
+ q_weight, scale, zero_point = self._calculate_qparams(original_weight)
decompressor = self._create_decompressor(
scale, zero_point, q_weight, original_weight
@@ -182,10 +180,7 @@ def _create_decompressor(
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
if zero_point is None:
- return INT8SymmetricWeightsDecompressor(
- scale, original_weight.dtype
- )
+ return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
return INT8AsymmetricWeightsDecompressor(
scale, zero_point, original_weight.dtype
)
-
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 9db79fce9f9..bef1ef3274f 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,7 +12,6 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
-from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
@@ -21,12 +20,12 @@
INT8WeightObserver,
)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
- get_weight_compression_configuration,
-)
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
)
+from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
+ get_weight_compression_configuration,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
PerChannelMinMaxObserver,
@@ -118,7 +117,7 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
- subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+ subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
)
@@ -178,7 +177,9 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
+ all_wc_params, _ = self._algo.get_weight_compression_parameters(
+ model, nncf_graph
+ )
for wc_param in all_wc_params:
node_with_weight = wc_param.node_with_weight
@@ -187,9 +188,7 @@ def _annotate_weight_compression(
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = self._get_weight_edge(target_node, nncf_graph)
- qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
- wc_param=wc_param
- )
+ qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
return node_vs_torch_annotation
@@ -216,7 +215,9 @@ def _annotate_post_training_quantization(
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+ qspec: QuantizationSpecBase = (
+ self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -426,8 +427,11 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
qmode = wc_param.compression_config.mode
extra_args["wc_param"] = wc_param
is_asym_mode = wc_param.compression_config.is_asym_mode
- if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- observer = INT4WeightObserver
+ if qmode in [
+ nncf.CompressWeightsMode.INT4_ASYM,
+ nncf.CompressWeightsMode.INT4_SYM,
+ ]:
+ observer = INT4WeightObserver # type: ignore[type-abstract]
quant_min = -8 if not is_asym_mode else 0
quant_max = 7 if not is_asym_mode else 15
dtype = torch.int8
@@ -438,7 +442,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
else torch.per_channel_affine
)
else:
- observer = INT8WeightObserver
+ observer = INT8WeightObserver # type: ignore[type-abstract]
quant_min = -128 if not is_asym_mode else 0
quant_max = 127 if not is_asym_mode else 255
dtype = torch.int8
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 578fd0fea7b..d9c282888cc 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -43,10 +43,10 @@
)
from executorch.extension.llm.export.quantizer_lib import (
get_coreml_quantizer,
+ get_ov_quantizer,
get_pt2e_quantization_params,
get_pt2e_quantizers,
get_qnn_quantizer,
- get_ov_quantizer,
get_vulkan_quantizer,
)
from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -897,7 +897,9 @@ def _to_edge_and_lower_llama_openvino(
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+ partitioners
+ )
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 83d4a84420d..df8c2a5e36c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -220,20 +220,22 @@ def get_ov_quantizer(
group_size: int = 32,
):
try:
- from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
- import nncf
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
+ from executorch.backends.openvino.quantizer import (
+ OpenVINOQuantizer,
+ QuantizationMode,
)
-
+ except ImportError:
+ raise ImportError("Please install nncf via backends/openvino/requirements.txt")
+
backend, quant_config = pt2e_quantize.split("_")
assert (
backend == "openvino"
), f"The quantization config is for backend {backend} instead of openvino."
- assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+ assert (
+ group_size
+ ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
+ # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
"linear_13",
"linear_14",
@@ -258,7 +260,8 @@ def get_ov_quantizer(
"linear_106",
"linear_109",
"linear_110",
- "linear_111",]
+ "linear_111",
+ ]
if quant_config == "4wo":
mode = QuantizationMode.INT4WO_ASYM
From 0e10f28242129a3c332ccdbd7a3b9a4340a8e1a1 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Tue, 9 Sep 2025 21:52:23 +0400
Subject: [PATCH 054/266] revert _calculate_qparams back to calculate_qparams
---
backends/openvino/quantizer/observers.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index faeb4fa7a60..6cda4561604 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -58,7 +58,7 @@ def __init__(
super().__init__(dtype=dtype, is_dynamic=False)
self._wc_param = wc_param
- def _calculate_qparams( # type: ignore[override]
+ def calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -94,7 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self._calculate_qparams(original_weight)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
decompressor = self._create_decompressor(
scale, zero_point, q_weight, original_weight
From 05f5a929c7c5b9a79859d9c9848ce37dd0c16b41 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 10 Sep 2025 18:49:08 +0400
Subject: [PATCH 055/266] remove manual ignored nodes
---
extension/llm/export/quantizer_lib.py | 29 ---------------------------
1 file changed, 29 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index df8c2a5e36c..870080a7549 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,34 +235,6 @@ def get_ov_quantizer(
group_size
), "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
- fp_node_names = [
- "linear_13",
- "linear_14",
- "linear_35",
- "linear_56",
- "linear_70",
- "linear_71",
- "linear_77",
- "linear_78",
- "linear_84",
- "linear_85",
- "linear_88",
- "linear_91",
- "linear_92",
- "linear_95",
- "linear_96",
- "linear_98",
- "linear_99",
- "linear_102",
- "linear_103",
- "linear_105",
- "linear_106",
- "linear_109",
- "linear_110",
- "linear_111",
- ]
-
if quant_config == "4wo":
mode = QuantizationMode.INT4WO_ASYM
@@ -274,7 +246,6 @@ def get_ov_quantizer(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
- ov_quantizer.set_ignored_scope(names=fp_node_names)
return ov_quantizer
From fbe0e21137ee9ebc8ea246e61fd9cfa252f57b15 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 10 Sep 2025 18:52:42 +0400
Subject: [PATCH 056/266] add ratio to quantizer initialization
---
extension/llm/export/quantizer_lib.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 870080a7549..350e8b3ce7c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,17 +235,23 @@ def get_ov_quantizer(
group_size
), "Group Size None is Not Supported. It should be set to -1 for per-channel."
+ quantization_params = {}
+
if quant_config == "4wo":
- mode = QuantizationMode.INT4WO_ASYM
+ quantization_params["mode"] = QuantizationMode.INT4WO_ASYM
+ quantization_params["group_size"] = group_size
+ quantization_params["ratio"] = 0.8
elif quant_config == "8wo":
- group_size = -1
- mode = QuantizationMode.INT8WO_SYM
+ quantization_params["mode"] = QuantizationMode.INT8WO_ASYM
+ quantization_params["group_size"] = -1
+ quantization_params["ratio"] = None
+
else:
raise AssertionError(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
- ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+ ov_quantizer = OpenVINOQuantizer(**quantization_params)
return ov_quantizer
From 6bff1cdb00ebdae53b57ab706cab6e9e9ee7e335 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Thu, 11 Sep 2025 23:04:13 +0400
Subject: [PATCH 057/266] Update export_llama_lib.py
---
examples/models/llama/export_llama_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index d9c282888cc..cbbf169a085 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -768,7 +768,7 @@ def get_quantizer_and_quant_params(llm_config):
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert not quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
- group_size = group_size if group_size else 32
+ group_size = group_size if group_size else 128
ov_quantizer = get_ov_quantizer(
llm_config.quantization.pt2e_quantize.value, group_size
)
From d744ae95f3cf806278b12db346105e233a2daec5 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Thu, 11 Sep 2025 23:04:50 +0400
Subject: [PATCH 058/266] Update quantizer_lib.py
---
extension/llm/export/quantizer_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 350e8b3ce7c..f92c59cebd3 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -217,7 +217,7 @@ def get_qnn_quantizer(
def get_ov_quantizer(
pt2e_quantize: str,
- group_size: int = 32,
+ group_size: int = 128,
):
try:
from executorch.backends.openvino.quantizer import (
From b874204d7d8eba9aa35dc8f9e55bd47bc0719cbb Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Thu, 11 Sep 2025 14:22:29 -0700
Subject: [PATCH 059/266] Updated NNCF commit id
---
backends/openvino/requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 2ada445414c..519818d0aac 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
transformers
-git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf
From 41ac36a8a513e2adbc3015d231f071b7530efae0 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 11 Sep 2025 16:21:43 -0700
Subject: [PATCH 060/266] openvino llama export configuration - initial
---
examples/openvino/llama/README.md | 11 ++++++++++
.../llama/llama3_2_ov_4wo_config.yaml | 20 +++++++++++++++++++
2 files changed, 31 insertions(+)
create mode 100644 examples/openvino/llama/README.md
create mode 100644 examples/openvino/llama/llama3_2_ov_4wo_config.yaml
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
new file mode 100644
index 00000000000..30644af3cde
--- /dev/null
+++ b/examples/openvino/llama/README.md
@@ -0,0 +1,11 @@
+
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m extension.llm.export.export_llm \
+ --config llama3_2_ov_4wo_config.yaml \
+ +base.model_class="llama3_2" \
+ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ +base.params="${LLAMA_PARAMS:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \
diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
new file mode 100644
index 00000000000..7f47f133216
--- /dev/null
+++ b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
@@ -0,0 +1,20 @@
+base:
+ metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+ use_kv_cache: True
+ dtype_override: fp32
+ enable_dynamic_shape: False
+
+export:
+ output_dir: "../"
+
+quantization:
+ pt2e_quantize: "openvino_4wo"
+
+backend:
+ openvino:
+ enabled: True
+
+debug:
+ verbose: True
From 08461ec1b54de22b279511669a862d20ecef0f5d Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 11 Sep 2025 16:32:20 -0700
Subject: [PATCH 061/266] updated ov llama config file
---
.../{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
rename examples/openvino/llama/{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} (90%)
diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml
similarity index 90%
rename from examples/openvino/llama/llama3_2_ov_4wo_config.yaml
rename to examples/openvino/llama/llama3_2_ov_4wo.yaml
index 7f47f133216..68a53708fb9 100644
--- a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
+++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml
@@ -6,11 +6,9 @@ model:
dtype_override: fp32
enable_dynamic_shape: False
-export:
- output_dir: "../"
-
quantization:
pt2e_quantize: "openvino_4wo"
+ group_size: 128
backend:
openvino:
From be85af8b86b995b4879e4382cc00f00eb7584d16 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:14:11 -0700
Subject: [PATCH 062/266] Update README.md
---
examples/openvino/llama/README.md | 40 +++++++++++++++++++++++++++----
1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 30644af3cde..e5571e3da79 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -1,11 +1,41 @@
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
+# Export Llama with OpenVINO Backend
-python -m extension.llm.export.export_llm \
+## Download the Model
+Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+
+## Environment Setup
+Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+
+## Export the model:
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+
+```
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m executorch.extension.llm.export.export_llm \
--config llama3_2_ov_4wo_config.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
- +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
+```
+
+## Build OpenVINO C++ Runtime with Llama Runner:
+First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --cpp_runtime
+```
+Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --llama_runner
+```
+The executable is saved in `/cmake-out/examples/models/llama/llama_main`
+
+## Execute Inference Using Llama Runner
+Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
+```
+./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
+```
From 35f1d84b05b285f1cf041ac6e4c95b840e9631ca Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:20:28 -0700
Subject: [PATCH 063/266] Update README.md
---
examples/openvino/README.md | 53 +++----------------------------------
1 file changed, 4 insertions(+), 49 deletions(-)
diff --git a/examples/openvino/README.md b/examples/openvino/README.md
index dbce5df1b55..0ecedde092c 100644
--- a/examples/openvino/README.md
+++ b/examples/openvino/README.md
@@ -9,7 +9,10 @@ Below is the layout of the `examples/openvino` directory, which includes the nec
```
examples/openvino
├── README.md # Documentation for examples (this file)
-└── aot_optimize_and_infer.py # Example script to export and execute models
+├── aot_optimize_and_infer.py # Example script to export and execute models
+└── llama
+ ├── README.md # Documentation for Llama example
+ └── llama3_2_ov_4wo.yaml # Configuration file for exporting Llama3.2 with OpenVINO backend
```
# Build Instructions for Examples
@@ -183,51 +186,3 @@ Run inference with a given model for 10 iterations:
--model_path=model.pte \
--num_executions=10
```
-
-# Export Llama with OpenVINO Backend
-
-## Download the Model
-Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
-
-## Environment Setup
-Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
-
-## Export the model:
-Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
-
-```
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
-
-python -u -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --openvino \
- -d fp32 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama.pte" \
- --verbose \
- --disable_dynamic_shape \
- --tokenizer_path "${LLAMA_TOKENIZER:?}" \
- --nncf_compression
-```
-
-## Build OpenVINO C++ Runtime with Llama Runner:
-First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
-```bash
-./openvino_build.sh
-```
-Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder:
-```bash
-./openvino_build.sh --llama_runner
-```
-The executable is saved in `/cmake-out/examples/models/llama/llama_main`
-
-## Execute Inference Using Llama Runner
-Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
-```
-./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
-```
From 4426541d133b8d9c3148c06654b870f27b4123d0 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:25:34 -0700
Subject: [PATCH 064/266] Update README.md
---
examples/openvino/llama/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index e5571e3da79..abb3f5179cb 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d
Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
@@ -37,5 +37,5 @@ The executable is saved in `/cmake-out/examples/models/llama/ll
## Execute Inference Using Llama Runner
Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
```
-./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
+./cmake-out/examples/models/llama/llama_main --model_path=/examples/openvino/llama/llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
```
From 6b936c5ddf8ab6c356315fd67f293a331f1a4aaf Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:26:51 -0700
Subject: [PATCH 065/266] Update README.md
---
examples/openvino/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index abb3f5179cb..4de20a0f061 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -16,7 +16,7 @@ LLAMA_PARAMS=/params.json
LLAMA_TOKENIZER=/tokenizer.model
python -m executorch.extension.llm.export.export_llm \
- --config llama3_2_ov_4wo_config.yaml \
+ --config llama3_2_ov_4wo.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
From bba4a01437ef5b1b6a6ddd7af5a406a9cc9842ca Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:51:22 -0700
Subject: [PATCH 066/266] Update README.md
---
examples/openvino/llama/README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 4de20a0f061..d357f038781 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d
Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
@@ -17,6 +17,7 @@ LLAMA_TOKENIZER=/tokenizer.model
python -m executorch.extension.llm.export.export_llm \
--config llama3_2_ov_4wo.yaml \
+ +backend.openvino.device="CPU" \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
From 1421921da0a6b083c17c9fe85b5b5f8beebd7216 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Fri, 12 Sep 2025 13:05:24 +0400
Subject: [PATCH 067/266] Update README.md with quantization paragraph
---
examples/openvino/llama/README.md | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index d357f038781..7a97e27410c 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -24,6 +24,24 @@ python -m executorch.extension.llm.export.export_llm \
+base.tokenizer_path="${LLAMA_TOKENIZER:?}"
```
+### Compress Model Weights and Export
+OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU.
+
+```
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m executorch.extension.llm.export.export_llm \
+ --config llama3_2_ov_4wo.yaml \
+ +backend.openvino.device="CPU" \
+ +base.model_class="llama3_2" \
+ +pt2e_quantize opevnino_4wo \
+ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ +base.params="${LLAMA_PARAMS:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
+```
+
## Build OpenVINO C++ Runtime with Llama Runner:
First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
```bash
From f050eeac96dd63c158afb526c1df1ac13beec0f6 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Sun, 14 Sep 2025 20:39:13 -0700
Subject: [PATCH 068/266] formatting fix
---
backends/openvino/quantizer/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index 0fd8c10b249..5aae52ef3e8 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
+from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model
__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
From 4bfdca9e95de0bbe41e3f0e8df8e4f1e8476d97f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Sun, 14 Sep 2025 20:44:22 -0700
Subject: [PATCH 069/266] Update README.md
---
examples/openvino/llama/README.md | 17 +----------------
1 file changed, 1 insertion(+), 16 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 7a97e27410c..46dbfb8c2f0 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -25,22 +25,7 @@ python -m executorch.extension.llm.export.export_llm \
```
### Compress Model Weights and Export
-OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU.
-
-```
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
-
-python -m executorch.extension.llm.export.export_llm \
- --config llama3_2_ov_4wo.yaml \
- +backend.openvino.device="CPU" \
- +base.model_class="llama3_2" \
- +pt2e_quantize opevnino_4wo \
- +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
- +base.params="${LLAMA_PARAMS:?}" \
- +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
-```
+OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved by setting `pt2e_quantize` option in `llama3_2_ov_4wo.yaml` file under `quantization`. Set this parameter to `openvino_4wo` for INT4 or `openvino_8wo` for INT8 weight compression. It is set to `openvino_4wo` in `llama3_2_ov_4wo.yaml` file by default. For modifying the group size, set `group_size` option in `llama3_2_ov_4wo.yaml` file under `quantization`. By default group size 128 is used to achieve optimal performance with the NPU.
## Build OpenVINO C++ Runtime with Llama Runner:
First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
From 16aba1bb1bb52632829e5a84ef0dd15f0e01d464 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:21:59 -0700
Subject: [PATCH 070/266] Update non_cpu_backends.md for OpenVINO instructions
---
examples/models/llama/non_cpu_backends.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md
index f414582a3c1..6e5d0b63256 100644
--- a/examples/models/llama/non_cpu_backends.md
+++ b/examples/models/llama/non_cpu_backends.md
@@ -22,3 +22,6 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu
### MTK
Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip
+
+### OpenVINO
+Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs.
From 155529f2a63bffeaa6539908dabda16e8d0e415f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:22:58 -0700
Subject: [PATCH 071/266] Update llama instructions link for OpenVINO backend
---
examples/models/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 784142b61f1..aba3b255fee 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -136,7 +136,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md).
# Instructions
From 5875aa8af0b07474b6d7d066164dc5a298b26d9a Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:25:46 -0700
Subject: [PATCH 072/266] Remove OpenVINO from non_cpu_backends.md
---
examples/models/llama/non_cpu_backends.md | 3 ---
1 file changed, 3 deletions(-)
diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md
index 6e5d0b63256..f414582a3c1 100644
--- a/examples/models/llama/non_cpu_backends.md
+++ b/examples/models/llama/non_cpu_backends.md
@@ -22,6 +22,3 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu
### MTK
Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip
-
-### OpenVINO
-Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs.
From 2630fd6c1db8f3e8eb5a840b34b96b48210c9362 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 11:03:51 -0700
Subject: [PATCH 073/266] Update llama instructions for OpenVINO backend
---
examples/models/llama/README.md | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index aba3b255fee..516f0073ef1 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -94,6 +94,8 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p
+[Please visit this section to try it on OpenVINO backend](../../openvino/llama/README.md).
+
## Llama 3/3.1 8B
Since Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized (PTQ) model.
@@ -136,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md).
# Instructions
From 6d0cbc53a5143c0bf66333872fdecefbc66b60d0 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 11:11:17 -0700
Subject: [PATCH 074/266] Removed the comma which was added by mistake
---
examples/models/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 516f0073ef1..d0e72234c54 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -138,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md).
# Instructions
From 3fbefecb61e147114c2aabc02079e88fa6d7777f Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 16 Sep 2025 12:18:52 -0700
Subject: [PATCH 075/266] Added NPU in choices
---
examples/models/llama/export_llama_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ed352c0997e..4f4ef2553aa 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -464,8 +464,8 @@ def build_args_parser() -> argparse.ArgumentParser:
"--openvino_device",
type=str,
default="CPU",
- choices=["CPU", "GPU"],
- help="Specify the device for Openvino (CPU or GPU).",
+ choices=["CPU", "GPU", "NPU"],
+ help="Specify the device for Openvino (CPU, GPU or NPU).",
)
parser.add_argument(
From 12e51c72d6f184c1ee6902d6d8f895292a4d6d92 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 16 Sep 2025 15:26:06 -0700
Subject: [PATCH 076/266] Fixed ref links
---
examples/openvino/llama/README.md | 6 +++---
examples/openvino/llama/llama3_2_ov_4wo.yaml | 11 +++++++----
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 46dbfb8c2f0..a98645b3918 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -2,13 +2,13 @@
# Export Llama with OpenVINO Backend
## Download the Model
-Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+Follow the [instructions](../../../examples/models/llama/README.md#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
## Environment Setup
-Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+Follow the [instructions](../../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2_ov.pte`. For modifying the output name, change `output_name` in `llama3_2_ov_4wo.yaml` file under `export`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
diff --git a/examples/openvino/llama/llama3_2_ov_4wo.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml
index 68a53708fb9..8fb1d7a1c09 100644
--- a/examples/openvino/llama/llama3_2_ov_4wo.yaml
+++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml
@@ -2,17 +2,20 @@ base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
model:
- use_kv_cache: True
+ use_kv_cache: true
dtype_override: fp32
- enable_dynamic_shape: False
+ enable_dynamic_shape: false
quantization:
pt2e_quantize: "openvino_4wo"
group_size: 128
+export:
+ output_name: "llama3_2_ov.pte"
+
backend:
openvino:
- enabled: True
+ enabled: true
debug:
- verbose: True
+ verbose: false
From 72331f5d0feaea93cef7517fda0eba7942ac6dd2 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 17 Sep 2025 13:16:49 -0700
Subject: [PATCH 077/266] Added Remove clone ops transformation to OpenVINO
backend
---
backends/openvino/preprocess.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..66d5ec97b0a 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,6 +8,7 @@
from typing import final, List
+from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
@@ -36,6 +37,14 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
+ # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
+ remove_clone_transform = RemoveCloneOpsTransform()
+ transformed_result = remove_clone_transform(edge_program.graph_module)
+
+ # Update the edge_program with the transformed graph
+ if transformed_result.graph_module is not None:
+ edge_program._graph_module = transformed_result.graph_module
+
input_names = edge_program.graph_signature.user_inputs
args = []
for node in edge_program.graph.nodes:
@@ -47,7 +56,9 @@ def preprocess(
compile_options[spec.key] = spec.value.decode()
compiled = openvino_compile(
- edge_program.module(), *args, options=compile_options
+ edge_program.module(),
+ *args,
+ options=compile_options
)
model_bytes = compiled.export_model()
From 8016165619eee3777e2ef437e4b83de84b3582b6 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 17 Sep 2025 13:28:50 -0700
Subject: [PATCH 078/266] Fixed variable names
---
backends/openvino/preprocess.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 66d5ec97b0a..7fc9d61d68e 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -38,12 +38,11 @@ def preprocess(
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
# Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
- remove_clone_transform = RemoveCloneOpsTransform()
- transformed_result = remove_clone_transform(edge_program.graph_module)
+ transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
# Update the edge_program with the transformed graph
- if transformed_result.graph_module is not None:
- edge_program._graph_module = transformed_result.graph_module
+ if transformed_ep.graph_module is not None:
+ edge_program._graph_module = transformed_ep.graph_module
input_names = edge_program.graph_signature.user_inputs
args = []
From f0d9fc72f504cb7e80ee34c02bca2e62977a1c9e Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 15:30:48 -0700
Subject: [PATCH 079/266] Added extended support list for openvino backend
---
backends/openvino/partitioner.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 20841d6730b..00107959412 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -34,6 +34,9 @@ def __init__(self):
class OpenvinoOperatorsSupport(OperatorSupportBase):
+ extended_support_dict = {
+ "torch.ops.dim_order_ops._clone_dim_order.default": None,
+ }
def __init__(
self,
@@ -77,7 +80,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
if node.name in self._enabled_ops_by_name:
return True
- supported_ops = OperatorSupport(options)._support_dict
+ supported_ops = (
+ OperatorSupport(options)._support_dict | self.extended_support_dict
+ )
if op_type == "getitem":
return True
From 9b41c28be3e266c10808ae07cc1cf1ff84112280 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 15:31:06 -0700
Subject: [PATCH 080/266] formatting fix
---
backends/openvino/preprocess.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 7fc9d61d68e..3ba693973e0 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -55,9 +55,7 @@ def preprocess(
compile_options[spec.key] = spec.value.decode()
compiled = openvino_compile(
- edge_program.module(),
- *args,
- options=compile_options
+ edge_program.module(), *args, options=compile_options
)
model_bytes = compiled.export_model()
From e7517263cdae812bf96941c6ececd73790f1c69a Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 16:09:00 -0700
Subject: [PATCH 081/266] formatting fix
---
backends/openvino/preprocess.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 3ba693973e0..72c781c0fb3 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -41,7 +41,7 @@ def preprocess(
transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
# Update the edge_program with the transformed graph
- if transformed_ep.graph_module is not None:
+ if transformed_ep and transformed_ep.graph_module:
edge_program._graph_module = transformed_ep.graph_module
input_names = edge_program.graph_signature.user_inputs
From 8106204b8a4af557bc6d925b070d9202789c14b4 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 30 Sep 2025 15:32:58 -0700
Subject: [PATCH 082/266] Added DimorderOpsRevertPass to Openvino backend
---
backends/openvino/partitioner.py | 1 +
backends/openvino/preprocess.py | 5 ++---
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 00107959412..0d407e33f6e 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -36,6 +36,7 @@ def __init__(self):
class OpenvinoOperatorsSupport(OperatorSupportBase):
extended_support_dict = {
"torch.ops.dim_order_ops._clone_dim_order.default": None,
+ "torch.ops.dim_order_ops._to_dim_order_copy.default": None,
}
def __init__(
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 72c781c0fb3..7d89e117dc6 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,7 +8,7 @@
from typing import final, List
-from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
@@ -37,8 +37,7 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
- # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
- transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
+ transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module)
# Update the edge_program with the transformed graph
if transformed_ep and transformed_ep.graph_module:
From d95143ebe0fee4bfe127ff6d99e7fe3bd1693728 Mon Sep 17 00:00:00 2001
From: Onuralp SEZER
Date: Wed, 1 Oct 2025 21:15:57 +0300
Subject: [PATCH 083/266] refactor:(samsung backend): replace pkg_resources
with importlib.resources for schema loading (#14654)
This PR refactors the Samsung backend schema loading logic in
compile_options.py by replacing pkg_resources with importlib.resources.
This modernizes resource access, improves compatibility with Python
packaging standards, and removes the dependency on setuptools. No
functional changes to the compile options logic; only the resource
loading mechanism is updated.
Signed-off-by: Onuralp SEZER
---
.../samsung/serialization/compile_options.py | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/backends/samsung/serialization/compile_options.py b/backends/samsung/serialization/compile_options.py
index 1ad2350cfeb..a4af40368e9 100644
--- a/backends/samsung/serialization/compile_options.py
+++ b/backends/samsung/serialization/compile_options.py
@@ -11,7 +11,8 @@
from dataclasses import dataclass
from enum import IntEnum, unique
-import pkg_resources
+from importlib.resources import files
+
from executorch.exir._serialize._dataclass import _DataclassEncoder
from executorch.exir._serialize._flatbuffer import _flatc_compile
from executorch.exir.backend.backend_details import CompileSpec
@@ -36,12 +37,15 @@ def gen_samsung_backend_compile_spec_core(options: EnnExecuTorchOptions) -> Comp
with tempfile.TemporaryDirectory() as d:
# schema
schema_path = os.path.join(d, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME))
+
+ schema_content = (
+ files(__package__)
+ .joinpath(f"{COMPILE_OPTION_SCHEMA_NAME}.fbs")
+ .read_bytes()
+ )
+
with open(schema_path, "wb") as schema_file:
- schema_file.write(
- pkg_resources.resource_string(
- __name__, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME)
- )
- )
+ schema_file.write(schema_content)
# dump json
json_path = os.path.join(d, "{}.json".format(COMPILE_OPTION_SCHEMA_NAME))
enn_options_json = json.dumps(options, cls=_DataclassEncoder, indent=4)
From eaf0e174f09e9cfa1584d8e77b8f06abf18b8e1b Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 1 Oct 2025 11:28:42 -0700
Subject: [PATCH 084/266] Fixed linter issues
---
backends/openvino/preprocess.py | 3 ++-
extension/llm/export/config/llm_config.py | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 7d89e117dc6..691115f6579 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,13 +8,14 @@
from typing import final, List
-from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped]
openvino_compile,
)
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index a176fa71dcc..0ac965b98cc 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -465,6 +465,7 @@ class OpenvinoConfig:
nncf_compression: bool = False
nncf_compression_group_size: int = 32
+
@dataclass
class TorchAOKernelsConfig:
"""
From 19be2a3ccbfb26f20cce1cc83a1f07e6e8c909be Mon Sep 17 00:00:00 2001
From: cccclai
Date: Wed, 1 Oct 2025 12:43:30 -0700
Subject: [PATCH 085/266] Try to get nightly wheel build work with qnn (#14633)
Our current nightly/release wheel package is done following
https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows
As described by
https://github.com/pytorch/test-infra/blob/5398e1a00c39939f43251f29031c37e6d0c84647/.github/workflows/build_wheels_linux.yml#L4,
The docker image infra team used to release nightly/release package is
from https://github.com/pypa/manylinux, and it's currently using
https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based.
It means the glibc version is 2.28 and GCC is 14.
The issue is that, QNN .so files are not compatible with 2.28. The
minimum version is 2.34 (I tried 2.29 the first time when it failed and
asked for 2.29, but it still fails).
In this PR, instead of checking glibc and failing directly when the minimum
version isn't met, we download glibc 2.34 to /tmp. A
different strategy compared with libstdc++ is that we don't load it,
because the Python process itself starts with the system glibc 2.28. We
need to re-execute the process with the new glibc
---
backends/qualcomm/__init__.py | 14 +-
backends/qualcomm/scripts/download_qnn_sdk.py | 280 ++++++++++++++----
setup.py | 3 +-
3 files changed, 218 insertions(+), 79 deletions(-)
diff --git a/backends/qualcomm/__init__.py b/backends/qualcomm/__init__.py
index 04ba5fcf24b..5770dfb0fcd 100644
--- a/backends/qualcomm/__init__.py
+++ b/backends/qualcomm/__init__.py
@@ -1,23 +1,13 @@
import os
-from .scripts.download_qnn_sdk import (
- check_glibc_exist_and_validate,
- install_qnn_sdk,
- is_linux_x86,
-)
+from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86
env_flag = os.getenv("EXECUTORCH_BUILDING_WHEEL", "0").lower()
# If users have preinstalled QNN_SDK_ROOT, we will use it.
qnn_sdk_root_flag = os.getenv("QNN_SDK_ROOT", None)
-if (
- env_flag not in ("1", "true", "yes")
- and not qnn_sdk_root_flag
- and is_linux_x86()
- and check_glibc_exist_and_validate()
-):
+if env_flag not in ("1", "true", "yes") and not qnn_sdk_root_flag and is_linux_x86():
ok = install_qnn_sdk()
-
if not ok:
raise RuntimeError("Failed to install QNN SDK. Please check the logs above.")
diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py
index 35006a41433..747524a0e5b 100644
--- a/backends/qualcomm/scripts/download_qnn_sdk.py
+++ b/backends/qualcomm/scripts/download_qnn_sdk.py
@@ -6,12 +6,15 @@
import platform
import re
import shutil
+import subprocess
+import sys
import tarfile
import tempfile
import urllib.request
import zipfile
from typing import Dict, List, Optional, Tuple
+
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
@@ -34,68 +37,81 @@ def is_linux_x86() -> bool:
)
-import subprocess
+#########################
+# Cache directory helper
+#########################
-MINIMUM_LIBC_VERSION = 2.29
+APP_NAMESPACE = ["executorch", "qnn"]
-REQUIRED_LIBC_LIBS = [
- "/lib/x86_64-linux-gnu/libc.so.6",
- "/lib64/libc.so.6",
- "/lib/libc.so.6",
-]
+def _get_staging_dir(*parts: str) -> pathlib.Path:
+ r"""
+ Return a cross-platform staging directory for staging SDKs/libraries.
+
+ - On Linux:
+ ~/.cache/executorch/qnn/
+ (falls back to $HOME/.cache if $XDG_CACHE_HOME is unset)
-def check_glibc_exist_and_validate() -> bool:
+ - On Windows (not supported yet, but as placeholder):
+ %LOCALAPPDATA%\executorch\qnn\
+ (falls back to $HOME/AppData/Local if %LOCALAPPDATA% is unset)
+
+ - Override:
+ If QNN_STAGING_DIR is set in the environment, that path is used instead.
+
+ Args:
+ parts (str): Subdirectories to append under the root staging dir.
+
+ Returns:
+ pathlib.Path: Fully qualified staging path.
"""
- Check if users have glibc installed.
+ # Environment override wins
+ base = os.environ.get("QNN_STAGING_DIR")
+ if base:
+ return pathlib.Path(base).joinpath(*parts)
+
+ system = platform.system().lower()
+ if system == "windows":
+ # On Windows, prefer %LOCALAPPDATA%, fallback to ~/AppData/Local
+ base = pathlib.Path(
+ os.environ.get("LOCALAPPDATA", pathlib.Path.home() / "AppData" / "Local")
+ )
+ elif is_linux_x86():
+ # On Linux/Unix, prefer $XDG_CACHE_HOME, fallback to ~/.cache
+ base = pathlib.Path(
+ os.environ.get("XDG_CACHE_HOME", pathlib.Path.home() / ".cache")
+ )
+ else:
+ raise ValueError(f"Unsupported platform: {system}")
+
+ return base.joinpath(*APP_NAMESPACE, *parts)
+
+
+def _atomic_download(url: str, dest: pathlib.Path):
"""
- exists = False
- for path in REQUIRED_LIBC_LIBS:
- try:
- output = subprocess.check_output(
- [path, "--version"], stderr=subprocess.STDOUT
- )
- output = output.decode().split("\n")[0]
- logger.debug(f"[QNN] glibc version for path {path} is: {output}")
- match = re.search(r"version (\d+\.\d+)", output)
- if match:
- version = match.group(1)
- if float(version) >= MINIMUM_LIBC_VERSION:
- logger.debug(f"[QNN] glibc version is {version}.")
- exists = True
- return True
- else:
- logger.error(
- f"[QNN] glibc version is too low. The minimum libc version is {MINIMUM_LIBC_VERSION} Please install glibc following the commands below."
- )
- else:
- logger.error("[QNN] glibc version not found.")
+ Download URL into dest atomically:
+ - Write to a temp file in the same dir
+ - Move into place if successful
+ """
+ dest.parent.mkdir(parents=True, exist_ok=True)
- except Exception:
- continue
+ # Temp file in same dir (guarantees atomic rename)
+ with tempfile.NamedTemporaryFile(dir=dest.parent, delete=False) as tmp:
+ tmp_path = pathlib.Path(tmp.name)
- if not exists:
- logger.error(
- r""""
- [QNN] glibc not found or the version is too low. Please install glibc following the commands below.
- Ubuntu/Debian:
- sudo apt update
- sudo apt install libc6
-
- Fedora/Red Hat:
- sudo dnf install glibc
-
- Arch Linux:
- sudo pacman -S glibc
-
- Also please make sure the glibc version is >= MINIMUM_LIBC_VERSION. You can verify the glibc version by running the following command:
- Option 1:
- ldd --version
- Option 2:
- /path/to/libc.so.6 --version
- """
- )
- return exists
+ try:
+ urllib.request.urlretrieve(url, tmp_path)
+ tmp_path.replace(dest) # atomic rename
+ except Exception:
+ # Clean up partial file on failure
+ if tmp_path.exists():
+ tmp_path.unlink(missing_ok=True)
+ raise
+
+
+####################
+# qnn sdk download management
+####################
def _download_archive(url: str, archive_path: pathlib.Path) -> bool:
@@ -178,9 +194,6 @@ def _download_qnn_sdk(dst_folder=SDK_DIR) -> Optional[pathlib.Path]:
if not is_linux_x86():
logger.info("[QNN] Skipping Qualcomm SDK (only supported on Linux x86).")
return None
- elif not check_glibc_exist_and_validate():
- logger.info("[QNN] Skipping Qualcomm SDK (glibc not found or version too old).")
- return None
else:
logger.info("[QNN] Downloading Qualcomm SDK for Linux x86")
@@ -241,6 +254,136 @@ def _extract_tar(archive_path: pathlib.Path, prefix: str, target_dir: pathlib.Pa
dst.write(src.read())
+####################
+# libc management
+####################
+
+GLIBC_VERSION = "2.34"
+GLIBC_REEXEC_GUARD = "QNN_GLIBC_REEXEC"
+MINIMUM_LIBC_VERSION = GLIBC_VERSION
+
+
+def _get_glibc_libdir() -> pathlib.Path:
+ glibc_root = _get_staging_dir(f"glibc-{GLIBC_VERSION}")
+ return glibc_root / "lib"
+
+
+def _parse_version(v: str) -> tuple[int, int]:
+ """Turn '2.34' → (2,34) so it can be compared."""
+ parts = v.split(".")
+ return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0
+
+
+def _current_glibc_version() -> str:
+ """Return system glibc version string (via ctypes)."""
+ try:
+ libc = ctypes.CDLL("libc.so.6")
+ func = libc.gnu_get_libc_version
+ func.restype = ctypes.c_char_p
+ return func().decode()
+ except Exception as e:
+ return f"error:{e}"
+
+
+def _resolve_glibc_loader() -> pathlib.Path | None:
+ """Return staged ld.so path if available."""
+ for p in [
+ _get_glibc_libdir() / f"ld-{GLIBC_VERSION}.so",
+ _get_glibc_libdir() / "ld-linux-x86-64.so.2",
+ ]:
+ if p.exists():
+ return p
+ return None
+
+
+def _stage_prebuilt_glibc():
+    """Download + extract Fedora 35 glibc RPM into the staging dir."""
+ logger.info(">>> Staging prebuilt glibc-%s from Fedora 35 RPM", GLIBC_VERSION)
+ _get_glibc_libdir().mkdir(parents=True, exist_ok=True)
+ rpm_path = _get_staging_dir("glibc") / "glibc.rpm"
+ work_dir = _get_staging_dir("glibc") / "extracted"
+ rpm_url = (
+ "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/"
+ "Everything/x86_64/os/Packages/g/glibc-2.34-7.fc35.x86_64.rpm"
+ )
+
+ rpm_path.parent.mkdir(parents=True, exist_ok=True)
+ logger.info("[glibc] Downloading %s -> %s", rpm_url, rpm_path)
+ try:
+ urllib.request.urlretrieve(rpm_url, rpm_path)
+ except Exception as e:
+ logger.error("[glibc] Failed to download %s: %s", rpm_url, e)
+ raise
+
+ # Extract
+ if work_dir.exists():
+ shutil.rmtree(work_dir)
+ work_dir.mkdir(parents=True)
+ subprocess.check_call(["bsdtar", "-C", str(work_dir), "-xf", str(rpm_path)])
+
+ # Copy runtime libs
+ staged = [
+ "ld-linux-x86-64.so.2",
+ "libc.so.6",
+ "libdl.so.2",
+ "libpthread.so.0",
+ "librt.so.1",
+ "libm.so.6",
+ "libutil.so.1",
+ ]
+ for lib in staged:
+ src = work_dir / "lib64" / lib
+ if src.exists():
+ shutil.copy2(src, _get_glibc_libdir() / lib)
+ logger.info("[glibc] Staged %s", lib)
+ else:
+ logger.warning("[glibc] Missing %s in RPM", lib)
+
+
+def ensure_glibc_minimum(min_version: str = GLIBC_VERSION):
+ """
+ Ensure process runs under glibc >= min_version.
+ - If system glibc is new enough → skip.
+ - Else → stage Fedora RPM and re-exec under staged loader.
+ """
+ current = _current_glibc_version()
+ logger.info("[glibc] Current loaded glibc: %s", current)
+
+ # If system glibc already sufficient → skip everything
+ m = re.match(r"(\d+\.\d+)", current)
+ if m and _parse_version(m.group(1)) >= _parse_version(min_version):
+ logger.info("[glibc] System glibc >= %s, no staging needed.", min_version)
+ return
+
+ # Avoid infinite loop
+ if os.environ.get(GLIBC_REEXEC_GUARD) == "1":
+ logger.info("[glibc] Already re-exec'd once, continuing.")
+ return
+
+ # Stage prebuilt if not already staged
+ if not (_get_glibc_libdir() / "libc.so.6").exists():
+ _stage_prebuilt_glibc()
+
+ loader = _resolve_glibc_loader()
+ if not loader:
+ logger.error("[glibc] Loader not found in %s", _get_glibc_libdir())
+ return
+
+ logger.info(
+ "[glibc] Re-execing under loader %s with libdir %s", loader, _get_glibc_libdir()
+ )
+ os.environ[GLIBC_REEXEC_GUARD] = "1"
+ os.execv(
+ str(loader),
+ [str(loader), "--library-path", str(_get_glibc_libdir()), sys.executable]
+ + sys.argv,
+ )
+
+
+####################
+# libc++ management
+####################
+
LLVM_VERSION = "14.0.0"
LIBCXX_BASE_NAME = f"clang+llvm-{LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04"
LLVM_URL = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/{LIBCXX_BASE_NAME}.tar.xz"
@@ -258,12 +401,17 @@ def _stage_libcxx(target_dir: pathlib.Path):
logger.info("[libcxx] Already staged at %s, skipping download", target_dir)
return
- temp_tar = pathlib.Path("/tmp") / f"{LIBCXX_BASE_NAME}.tar.xz"
- temp_extract = pathlib.Path("/tmp") / LIBCXX_BASE_NAME
+ libcxx_stage = _get_staging_dir(f"libcxx-{LLVM_VERSION}")
+ temp_tar = libcxx_stage / f"{LIBCXX_BASE_NAME}.tar.xz"
+ temp_extract = libcxx_stage / LIBCXX_BASE_NAME
if not temp_tar.exists():
logger.info("[libcxx] Downloading %s", LLVM_URL)
- urllib.request.urlretrieve(LLVM_URL, temp_tar)
+ _atomic_download(LLVM_URL, temp_tar)
+
+ # Sanity check before extracting
+ if not temp_tar.exists() or temp_tar.stat().st_size == 0:
+ raise FileNotFoundError(f"[libcxx] Tarball missing or empty: {temp_tar}")
logger.info("[libcxx] Extracting %s", temp_tar)
with tarfile.open(temp_tar, "r:xz") as tar:
@@ -437,8 +585,10 @@ def install_qnn_sdk() -> bool:
Returns:
True if both steps succeeded (or were already satisfied), else False.
"""
- if check_glibc_exist_and_validate():
- if _ensure_libcxx_stack():
- if _ensure_qnn_sdk_lib():
- return True
- return False
+ logger.info("[QNN] Starting SDK installation")
+
+ # Make sure we’re running under >= 2.34
+ ensure_glibc_minimum(GLIBC_VERSION)
+
+ # libc++ and QNN SDK setup
+ return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib()
diff --git a/setup.py b/setup.py
index fe9543f3243..97a1d05096e 100644
--- a/setup.py
+++ b/setup.py
@@ -467,11 +467,10 @@ def run(self):
# Following code is for building the Qualcomm backend.
from backends.qualcomm.scripts.download_qnn_sdk import (
_download_qnn_sdk,
- check_glibc_exist_and_validate,
is_linux_x86,
)
- if is_linux_x86() and check_glibc_exist_and_validate():
+ if is_linux_x86():
os.environ["EXECUTORCH_BUILDING_WHEEL"] = "1"
with tempfile.TemporaryDirectory() as tmpdir:
From 7ed926693fbaf471ec8072ff8896090f9fe5fd44 Mon Sep 17 00:00:00 2001
From: Hardik Sharma
Date: Wed, 1 Oct 2025 13:31:09 -0700
Subject: [PATCH 086/266] Move to ProxyValue instead of FakeTensor weights.
Differential Revision: D82605179
Pull Request resolved: https://github.com/pytorch/executorch/pull/14697
---
backends/cadence/aot/replace_ops.py | 202 +++++++++------------------
backends/cadence/aot/simplify_ops.py | 4 +-
2 files changed, 68 insertions(+), 138 deletions(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 8de0af7311d..9e95460f2f5 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -43,7 +43,6 @@
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
-from torch._subclasses import FakeTensor
from torch.fx.node import Argument
# A map to represent ops that:
@@ -90,11 +89,7 @@ def replace_logical_nop_where_with_where(
# Get the third arg node and its input
logical_not_node = node.args[0]
- logical_not_input_tensor = (
- logical_not_node.args[0].to_tensor()
- if isinstance(logical_not_node.args[0], ProxyValue)
- else logical_not_node.args[0]
- )
+ logical_not_input_tensor = logical_not_node.args[0].to_tensor()
# If the logical_not input is not a boolean tensor, bail.
if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
@@ -263,7 +258,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Glean the shape of input and output tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
in_shape = in_tensor.shape
out_shape = meta["val"].shape
# Get the select dimension
@@ -295,7 +290,7 @@ def call_operator(self, op, args, kwargs, meta):
# Create a zero bias tensor, and insert it as a graph buffer before the
# current node
- mat2_tensor = mat2.to_tensor() if isinstance(mat2, ProxyValue) else mat2
+ mat2_tensor = mat2.to_tensor()
bias_size = mat2_tensor.size(1)
zero_bias = super().call_operator(
exir_ops.edge.aten.full.default,
@@ -410,7 +405,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the old dim and new dim order
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
old_dims = tuple(range(in_tensor.dim()))
new_dims = args[1]
@@ -488,11 +483,7 @@ def call_operator(self, op, args, kwargs, meta):
repeats = args[1]
# Glean the shapes of input tensor
- in_shape = list(
- in_tensor.to_tensor().shape
- if isinstance(in_tensor, ProxyValue)
- else in_tensor.shape
- )
+ in_shape = list(in_tensor.to_tensor().shape)
# If the size of repeats is more than the dimensionality of the tensor,
# the output of repeat will be a higher-dimensional tensor. We reshape
@@ -793,15 +784,9 @@ def call_operator(self, op, args, kwargs, meta):
(in_tensor, weight, bias, stride, padding, dilation, groups) = args[0:7]
# Glean the shapes of input, weight, and output
- in_shape = (
- in_tensor.to_tensor().shape
- if isinstance(in_tensor, ProxyValue)
- else in_tensor.shape
- )
+ in_shape = in_tensor.to_tensor().shape
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
out_shape = meta["val"].shape
assert None not in {in_shape, weight_shape, out_shape}
@@ -823,26 +808,16 @@ def call_operator(self, op, args, kwargs, meta):
# Reshape the weight to [out_channels, in_channels * X]
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Reshape the input from 3d to 2d tensor
in_view = super().call_operator(
@@ -865,11 +840,7 @@ def call_operator(self, op, args, kwargs, meta):
out_zero_point,
) = args[7:12]
# If the multiplier and shift tensors are provided, use them.
- if (
- len(args) >= 14
- and isinstance(args[12], ProxyValue)
- and isinstance(args[13], ProxyValue)
- ):
+ if len(args) >= 14:
out_multiplier = args[12]
out_shift = args[13]
# If not, compute them.
@@ -1073,9 +1044,7 @@ def call_operator(self, op, args, kwargs, meta):
if groups != 1:
return super().call_operator(op, args, kwargs, meta)
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
# If this is a pointwise convolution, im2col will start dominating the
# runtime. So we call convolution op for this case.
if (
@@ -1114,8 +1083,6 @@ def call_operator(self, op, args, kwargs, meta):
{"dtype": torch.int32},
meta,
)
- if isinstance(in_tensor.to_tensor(), FakeTensor)
- else get_zero_point(in_tensor.to_tensor())
)
if quantized_op
else torch.tensor(0, dtype=torch.int32)
@@ -1151,26 +1118,16 @@ def call_operator(self, op, args, kwargs, meta):
# Get the product of the >2 dims of the weight
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Create the linear node, which multiplies the 3d input with 2d weight
# tensors with bias addition. The outermost dimension of the input is
@@ -1184,11 +1141,7 @@ def call_operator(self, op, args, kwargs, meta):
out_zero_point,
) = args[7:12]
# If the multiplier and shift tensors are provided, use them.
- if (
- len(args) >= 14
- and isinstance(args[12], ProxyValue)
- and isinstance(args[13], ProxyValue)
- ):
+ if len(args) >= 14:
out_multiplier = args[12]
out_shift = args[13]
# If not, compute them.
@@ -1276,9 +1229,7 @@ def call_operator(self, op, args, kwargs, meta):
# Get the shapes
out_shape = meta["val"].shape
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
assert None not in {weight_shape, out_shape}
# Determine if the transposed_convolution is NCHW or NHWC. The NHWC,
@@ -1332,26 +1283,16 @@ def call_operator(self, op, args, kwargs, meta):
# Reshape the weight to [out_channels, in_channels * X]
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Create the linear node, which multiplies the 3d input with 2d weight
# tensors with bias addition. The outermost dimension of the input is
@@ -1422,7 +1363,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the input tensor and shape
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
in_shape = in_tensor.shape
# Get the output tensor shape
out_shape = meta["val"].shape
@@ -1491,7 +1432,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Extract the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
leading_dims = math.prod(in_tensor.shape[:-1])
# If the tensor is not a vector, do nothing.
if leading_dims != 1:
@@ -1557,11 +1498,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(
exir_ops.edge.aten.full.default,
(
- (
- args[0].to_tensor().shape
- if isinstance(args[0], ProxyValue)
- else args[0].shape
- ),
+ args[0].to_tensor().shape,
args[1],
),
{},
@@ -1602,59 +1539,57 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
replaced_scalar_args: dict[
EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]]
] = {
- exir_ops.edge.cadence.quantized_add: (
+ exir_ops.edge.cadence.quantized_add.default: (
exir_ops.edge.cadence.quantized_add.per_tensor,
[1, 2, 4, 5],
),
- exir_ops.edge.cadence.quantized_conv2d_nchw: (
+ exir_ops.edge.cadence.quantized_conv2d_nchw.default: (
exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor,
[8, 9, 12, 13],
),
- exir_ops.edge.cadence.quantized_conv2d_nhwc: (
+ exir_ops.edge.cadence.quantized_conv2d_nhwc.default: (
exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor,
[8, 9, 12, 13],
),
- exir_ops.edge.cadence.quantized_fully_connected: (
+ exir_ops.edge.cadence.quantized_fully_connected.default: (
exir_ops.edge.cadence.quantized_fully_connected.per_tensor,
[4, 5, 6],
),
- exir_ops.edge.cadence.quantized_layer_norm: (
+ exir_ops.edge.cadence.quantized_layer_norm.default: (
exir_ops.edge.cadence.quantized_layer_norm.per_tensor,
[1, 2],
),
- exir_ops.edge.cadence.quantized_linear: (
+ exir_ops.edge.cadence.quantized_linear.default: (
exir_ops.edge.cadence.quantized_linear.per_tensor,
[4, 5, 6],
),
- exir_ops.edge.cadence.quantized_relu: (
+ exir_ops.edge.cadence.quantized_relu.default: (
exir_ops.edge.cadence.quantized_relu.per_tensor,
[1, 3, 4],
),
- exir_ops.edge.cadence.im2row: (
+ exir_ops.edge.cadence.im2row.default: (
exir_ops.edge.cadence.im2row.per_tensor,
[5],
),
- exir_ops.edge.cadence.requantize: (
+ exir_ops.edge.cadence.requantize.default: (
exir_ops.edge.cadence.requantize.per_tensor,
[1, 2, 3, 4],
),
}
def call_operator(self, op, args, kwargs, meta):
- op_edge_overload_packet = get_edge_overload_packet(op)
-
- if op_edge_overload_packet not in self.replaced_scalar_args:
+ if op not in self.replaced_scalar_args:
return super().call_operator(op, args, kwargs, meta)
# Get all the args that need to be replaced.
- new_op, args_to_be_replaced = self.replaced_scalar_args[op_edge_overload_packet]
+ new_op, args_to_be_replaced = self.replaced_scalar_args[op]
+
+ if op == new_op:
+ return super().call_operator(op, args, kwargs, meta)
updated_args = list(args)
for op_arg_index in args_to_be_replaced:
arg = args[op_arg_index]
- if not isinstance(arg, ProxyValue):
- return super().call_operator(op, args, kwargs, meta)
-
if not arg.is_tensor():
return super().call_operator(op, args, kwargs, meta)
@@ -1696,7 +1631,7 @@ def call_operator(self, op, args, kwargs, meta):
# Determine if the op is avg_pool1d or avg_pool2d
avg_pool1d: bool = op == exir_ops.edge.aten.avg_pool1d.default
# Get the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
# Replace avg_pool2d with custom avg_pool2d, and if the input tensor is
# quantized, pass its zero_point tensor as arg to the custom avg_pool2d.
@@ -2062,7 +1997,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the second tensor
- Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg
+ Y_tensor = Y_arg.to_tensor()
# Concretize the bias
zero_bias = super().call_operator(
exir_ops.edge.aten.full.default,
@@ -2071,19 +2006,14 @@ def call_operator(self, op, args, kwargs, meta):
meta,
)
- # If the arg was a ProxyValue, insert a transpose node. Otherwise we
- # can simply transpose the tensor inplace.
- if isinstance(Y_arg, ProxyValue):
- transpose_args = (Y_arg, -1, -2)
- transpose_node = super().call_operator(
- exir_ops.edge.aten.transpose_copy.int,
- transpose_args,
- {},
- meta,
- )
- Y_arg_t = transpose_node
- else:
- Y_arg_t = Y_tensor.transpose(-1, -2)
+ # Y_arg is always a ProxyValue, so we insert a transpose node
+ transpose_args = (Y_arg, -1, -2)
+ Y_arg_t = super().call_operator(
+ exir_ops.edge.aten.transpose_copy.int,
+ transpose_args,
+ {},
+ meta,
+ )
# Construct the new args, and return the transposed matmult op
new_args = (
@@ -2178,7 +2108,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
# Permute NCHW to NHWC for computation
in_tensor_permuted = in_tensor.permute(0, 2, 3, 1)
in_tensor_shape = in_tensor_permuted.shape
diff --git a/backends/cadence/aot/simplify_ops.py b/backends/cadence/aot/simplify_ops.py
index bf836f09044..92c14cb0f5d 100644
--- a/backends/cadence/aot/simplify_ops.py
+++ b/backends/cadence/aot/simplify_ops.py
@@ -19,7 +19,7 @@
from executorch.backends.cadence.aot.utils import rebind
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
-from executorch.exir.pass_base import ExportPass, ProxyValue
+from executorch.exir.pass_base import ExportPass
@register_cadence_pass(CadencePassAttribute(opt_level=0))
@@ -75,7 +75,7 @@ def call_operator(self, op, args, kwargs, meta):
slice_scatter = op == exir_ops.edge.aten.slice_scatter.default
# Parse the arguments
# Extract the tensor to be sliced, and the slicing dimension
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0
# Make dim non-negative
dim = dim if dim >= 0 else dim + in_tensor.dim()
From a4ac70d965298a192eaff26464017363876aa400 Mon Sep 17 00:00:00 2001
From: Abhinayk
Date: Wed, 1 Oct 2025 14:38:34 -0700
Subject: [PATCH 087/266] Disable nxp tests (#14730)
---
backends/nxp/tests/TARGETS | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/backends/nxp/tests/TARGETS b/backends/nxp/tests/TARGETS
index f492111aff2..c8ccd5fe900 100644
--- a/backends/nxp/tests/TARGETS
+++ b/backends/nxp/tests/TARGETS
@@ -1,3 +1,4 @@
+load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
@@ -50,5 +51,9 @@ python_pytest(
"//executorch/backends/nxp:neutron_backend",
":executorch_pipeline",
":models",
- ]
+ ],
+ labels = [
+ "local_only",
+ ci.skip_test(),
+ ],
)
From 649f92d4e5426d93312f4aff74ef6ba02697e834 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?=
<33344797+martinlsm@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:51:57 +0200
Subject: [PATCH 088/266] Arm backend: Correct type annotations in
aot_arm_compiler (#14627)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Correct/add type annotation in aot_arm_compiler.py
- Remove one redundant variable assignment (dead code)
Signed-off-by: Martin Lindström
---
examples/arm/aot_arm_compiler.py | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 53020d1bea0..0f3526975ff 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -61,6 +61,8 @@
from executorch.extension.export_util.utils import save_pte_program
from tabulate import tabulate
+from torch.export import ExportedProgram
+from torch.fx import GraphModule
from torch.utils.data import DataLoader
# Quantize model if required using the standard export quantizaion flow.
@@ -145,13 +147,13 @@ def get_model_and_inputs_from_name(
def quantize(
- model: torch.nn.Module,
+ model: GraphModule,
model_name: str,
compile_specs: EthosUCompileSpec | VgfCompileSpec | TosaCompileSpec,
example_inputs: Tuple[torch.Tensor],
evaluator_name: str | None,
evaluator_config: Dict[str, Any] | None,
-) -> torch.nn.Module:
+) -> GraphModule:
"""This is the official recommended flow for quantization in pytorch 2.0
export"""
logging.info("Quantizing Model...")
@@ -601,7 +603,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s
save_bundled_program(exec_prog, method_test_suites, output_name)
-def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
+def quantize_model(
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
+ compile_spec,
+) -> Tuple[GraphModule, ExportedProgram]:
model_int8 = quantize(
model,
args.model_name,
@@ -619,7 +626,10 @@ def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
def to_edge_TOSA_delegate(
- exported_program, args, model: torch.nn.Module, example_inputs
+ exported_program: ExportedProgram,
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
):
# As we can target multiple output encodings, one must
# be specified.
@@ -638,7 +648,6 @@ def to_edge_TOSA_delegate(
model_int8, exported_program = quantize_model(
args, model, example_inputs, compile_spec
)
- model = model_int8
if isinstance(compile_spec, EthosUCompileSpec):
partitioner = EthosUPartitioner(compile_spec)
@@ -660,7 +669,12 @@ def to_edge_TOSA_delegate(
return model_int8, edge
-def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_inputs):
+def to_edge_no_delegate(
+ exported_program: ExportedProgram,
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
+):
model_int8 = None
if args.quantize:
# As we can target multiple output encodings, one must
From 871fe39f4e2a2eb9833ac9d490543d9d7b73244a Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:54:12 +0200
Subject: [PATCH 089/266] Arm backend: Update full quantization annotation
(#14585)
full, full.default and fill_.Scalar were previously part of
_one_to_one_shared_input_or_input_act_qspec without having any input
nodes. This meant that these nodes were never annotated and solely
relied on the next node to annotate its input. This patch changes so
that full, full.default and fill_.Scalar are annotated in the same way
as scalar_tensor.default.
Also adds these targets to _is_large_scalar().
Signed-off-by: Oscar Andersson
---
.../arm/quantizer/quantization_annotator.py | 24 ++++++++++++-------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index d7c85447dd5..ebc91c22bbb 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -6,7 +6,7 @@
import logging
import operator
from dataclasses import dataclass
-from typing import Callable, List, Optional, Sequence
+from typing import Callable, cast, List, Optional, Sequence
import torch
import torch.fx
@@ -137,11 +137,18 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
node since histc op (in HistogramObserver) only works for values up to certain upper
bound.
"""
+ HISTC_UPPER_BOUND = 3.4028235e15
if node.op == "get_attr" and isinstance(node.target, str):
tensor = _get_node_target(gm, node.target)
# torch.histc works until this upper bound
- HISTC_UPPER_BOUND = 3.4028235e15
return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND
+ if node.op == "call_function" and node.target in (
+ torch.ops.aten.full.default,
+ torch.ops.aten.full,
+ torch.ops.aten.fill_.Scalar,
+ ):
+ fill_value = cast(float, node.args[1])
+ return abs(fill_value) > HISTC_UPPER_BOUND
return False
@@ -358,9 +365,6 @@ def _match_pattern(
torch.ops.aten.permute_copy.default,
torch.ops.aten.avg_pool2d.default,
torch.ops.aten.max_pool2d.default,
- torch.ops.aten.full.default,
- torch.ops.aten.full,
- torch.ops.aten.fill_.Scalar,
torch.ops.aten.flatten.using_ints,
torch.ops.aten.dropout.default,
torch.ops.aten.dropout_.default,
@@ -518,9 +522,6 @@ def any_or_hardtanh_min_zero(n: Node):
]
quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type]
elif node.target in _one_to_one_shared_input_or_input_act_qspec:
- if not isinstance(node.args[0], Node):
- return None
-
input_qspec = (
SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type]
if is_output_annotated(node.args[0]) # type: ignore
@@ -578,7 +579,12 @@ def any_or_hardtanh_min_zero(n: Node):
),
]
quant_properties.quant_output = None
- elif node.target in [torch.ops.aten.scalar_tensor.default]:
+ elif node.target in [
+ torch.ops.aten.scalar_tensor.default,
+ torch.ops.aten.full.default,
+ torch.ops.aten.full,
+ torch.ops.aten.fill_.Scalar,
+ ]:
quant_properties.quant_inputs = []
quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
elif node.target in [operator.getitem]:
From 0081bef92ef8bd0f58d0be3580f85dfcded2a3aa Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Wed, 1 Oct 2025 23:56:36 +0200
Subject: [PATCH 090/266] Arm backend: Add compile spec factories (#14376)
Signed-off-by: Erik Lundell
Co-authored-by: Digant Desai
---
backends/arm/TARGETS | 14 ++++
backends/arm/test/TARGETS | 6 ++
backends/arm/test/common.py | 1 +
backends/arm/test/tester/arm_tester.py | 96 ++++++++------------------
backends/arm/tosa/backend.py | 8 +--
backends/arm/util/_factory.py | 59 ++++++++++++++++
examples/arm/aot_arm_compiler.py | 33 ++-------
7 files changed, 121 insertions(+), 96 deletions(-)
create mode 100644 backends/arm/util/_factory.py
diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS
index a78ab252739..a737c4bc9de 100644
--- a/backends/arm/TARGETS
+++ b/backends/arm/TARGETS
@@ -106,3 +106,17 @@ runtime.python_library(
"//caffe2:torch",
]
)
+runtime.python_library(
+ name = "_factory",
+ srcs = [
+ "util/_factory.py"
+ ],
+ deps = [
+ ":ethosu",
+ ":vgf",
+ ":arm_compile_spec",
+ "//executorch/backends/arm/quantizer:lib",
+ "//executorch/exir/backend:operator_support",
+ "//executorch/exir/backend:compile_spec_schema",
+ ]
+)
diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS
index ec35b63f8f6..fd7d894fbf0 100644
--- a/backends/arm/test/TARGETS
+++ b/backends/arm/test/TARGETS
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load(":targets.bzl", "define_arm_tests")
@@ -58,6 +63,7 @@ runtime.python_library(
"//executorch/backends/arm/quantizer:lib",
"//executorch/backends/arm/tosa:mapping",
"//executorch/backends/arm:vgf",
+ "//executorch/backends/arm:_factory",
"//executorch/devtools/backend_debug:delegation_info",
"//executorch/exir/backend:operator_support",
"fbsource//third-party/pypi/tabulate:tabulate",
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index 963084d6091..f8a6242fc0c 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -14,6 +14,7 @@
import pytest
from executorch.backends.arm.ethosu import EthosUCompileSpec
+
from executorch.backends.arm.test.runner_utils import (
arm_executor_runner_exists,
corstone300_installed,
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 8bf72827549..9f530f428ce 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -28,17 +28,11 @@
import torch.fx
import torch.utils._pytree as pytree
-
from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
- EthosUQuantizer,
- get_symmetric_quantization_config,
- TOSAQuantizer,
- VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.test.runner_utils import (
dbg_tosa_fb_to_json,
get_output_quantization_params,
@@ -53,9 +47,13 @@
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
from executorch.backends.arm.tosa.mapping import extract_tensor_meta
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.util._factory import (
+ create_partitioner,
+ create_quantizer,
+ parse_compile_spec,
+)
+from executorch.backends.arm.vgf import VgfCompileSpec
from executorch.backends.test.harness.error_statistics import ErrorStatistics
from executorch.backends.test.harness.stages import Stage, StageType
@@ -83,7 +81,6 @@
_copy_module,
_update_exported_program_graph_module,
)
-
from tabulate import tabulate
from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec
@@ -103,12 +100,6 @@ def _dump_lowered_modules_artifact(
artifact.exported_program().graph_signature
)
- def get_output_format(lowered_module) -> str | None:
- for spec in lowered_module.compile_specs:
- if spec.key == "output_format":
- return spec.value.decode()
- return None
-
for node in graph_module.graph.nodes:
if node.op == "get_attr" and node.name.startswith("lowered_module_"):
lowered_module = getattr(graph_module, node.name)
@@ -116,13 +107,13 @@ def get_output_format(lowered_module) -> str | None:
lowered_module, LoweredBackendModule
), f"Attribute {node.name} must be of type LoweredBackendModule."
- output_format = get_output_format(lowered_module)
- if output_format == "tosa":
+ compile_spec = parse_compile_spec(lowered_module.compile_specs)
+ if isinstance(compile_spec, TosaCompileSpec):
tosa_fb = lowered_module.processed_bytes
to_print = dbg_tosa_fb_to_json(tosa_fb)
to_print = pformat(to_print, compact=True, indent=1)
output += f"\nTOSA deserialized {node.name}: \n{to_print}\n"
- elif output_format == EthosUCompileSpec.get_output_format():
+ elif isinstance(compile_spec, EthosUCompileSpec):
vela_cmd_stream = lowered_module.processed_bytes
output += f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n"
else:
@@ -284,13 +275,7 @@ def quantize(
quantize_stage: Optional[tester.Quantize] = None,
):
if quantize_stage is None:
- quantizer = None
- if isinstance(self.compile_spec, TosaCompileSpec):
- quantizer = TOSAQuantizer(self.compile_spec)
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- quantizer = EthosUQuantizer(self.compile_spec)
- elif isinstance(self.compile_spec, VgfCompileSpec):
- quantizer = VgfQuantizer(self.compile_spec)
+ quantizer = create_quantizer(self.compile_spec)
quantize_stage = tester.Quantize(
quantizer,
get_symmetric_quantization_config(),
@@ -312,14 +297,7 @@ def to_edge(
def partition(self, partition_stage: Optional[Partition] = None):
if partition_stage is None:
- if isinstance(self.compile_spec, TosaCompileSpec):
- arm_partitioner = TOSAPartitioner(self.compile_spec)
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- arm_partitioner = EthosUPartitioner(self.compile_spec)
- elif isinstance(self.compile_spec, VgfCompileSpec):
- arm_partitioner = VgfPartitioner(self.compile_spec)
- else:
- raise ValueError("compile spec doesn't target any Arm Partitioner")
+ arm_partitioner = create_partitioner(self.compile_spec)
partition_stage = Partition(arm_partitioner)
return super().partition(partition_stage)
@@ -329,7 +307,7 @@ def to_edge_transform_and_lower(
partitioners: Optional[List[Partitioner]] = None,
edge_compile_config: Optional[EdgeCompileConfig] = None,
additional_checks: Optional[
- List[Union[DontPartition | DontPartitionModule | DontPartitionName]]
+ List[DontPartition | DontPartitionModule | DontPartitionName]
] = None,
transform_passes: Optional[
Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
@@ -343,20 +321,9 @@ def to_edge_transform_and_lower(
if to_edge_and_lower_stage is None:
if partitioners is None:
- if isinstance(self.compile_spec, TosaCompileSpec):
- arm_partitioner = TOSAPartitioner(
- self.compile_spec, additional_checks
- )
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- arm_partitioner = EthosUPartitioner(
- self.compile_spec, additional_checks
- )
- elif isinstance(self.compile_spec, VgfCompileSpec):
- arm_partitioner = VgfPartitioner(
- self.compile_spec, additional_checks
- )
- else:
- raise ValueError("compile spec doesn't target any Arm Partitioner")
+ arm_partitioner = create_partitioner(
+ self.compile_spec, additional_checks
+ )
partitioners = [arm_partitioner]
to_edge_and_lower_stage = ToEdgeTransformAndLower(
partitioners,
@@ -743,22 +710,19 @@ def _get_tosa_operator_distribution(
op_list = []
id = 0
while lowered_module := getattr(graph_module, f"lowered_module_{id}", None):
- for spec in lowered_module.compile_specs:
- if spec.key != "output_format":
- continue
- if spec.value == b"tosa":
- tosa_fb = lowered_module.processed_bytes
- tosa_json = dbg_tosa_fb_to_json(tosa_fb)
- for region in tosa_json["regions"]:
- for block in region["blocks"]:
- op_list.extend(
- [operator["op"] for operator in block["operators"]]
- )
- break
- elif spec.value == EthosUCompileSpec.get_output_format().encode():
- return "Can not get operator distribution for Vela command stream."
- else:
- return f"Unknown output format '{spec.value}'."
+ compile_spec = parse_compile_spec(lowered_module.compile_specs)
+ if isinstance(compile_spec, TosaCompileSpec):
+ tosa_fb = lowered_module.processed_bytes
+ tosa_json = dbg_tosa_fb_to_json(tosa_fb)
+ for region in tosa_json["regions"]:
+ for block in region["blocks"]:
+ op_list.extend([operator["op"] for operator in block["operators"]])
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return "Can not get operator distribution for Vela command stream."
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return "Can not get operator distribution for VGF."
+ else:
+ return f"Unknown output format '{compile_spec.get_output_format()}'."
id += 1
if id == 0:
return "No delegate with name 'lowered_module_0 found in graph module."
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py
index 7596573be84..7a7ea2ca377 100644
--- a/backends/arm/tosa/backend.py
+++ b/backends/arm/tosa/backend.py
@@ -206,8 +206,8 @@ def filter_tosa_compile_specs(
hardware.
"""
- new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec)
- new_compile_spec._set_compile_specs(
- compile_spec.tosa_spec, [], compile_spec.get_intermediate_path()
+ return (
+ TosaCompileSpec(compile_spec.tosa_spec)
+ .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path())
+ .dump_debug_info(compile_spec.tosa_debug_mode)
)
- return new_compile_spec
diff --git a/backends/arm/util/_factory.py b/backends/arm/util/_factory.py
new file mode 100644
index 00000000000..23d8215fc9b
--- /dev/null
+++ b/backends/arm/util/_factory.py
@@ -0,0 +1,59 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
+from executorch.backends.arm.quantizer import (
+ EthosUQuantizer,
+ TOSAQuantizer,
+ VgfQuantizer,
+)
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+def parse_compile_spec(compile_specs: list[CompileSpec]) -> ArmCompileSpec:
+ output_format = None
+ for spec in compile_specs:
+ if spec.key == "output_format":
+ output_format = spec.value.decode()
+ break
+ else:
+ raise ValueError("Compile spec without output format.")
+ if output_format == TosaCompileSpec.get_output_format():
+ return TosaCompileSpec.from_list(compile_specs)
+ if output_format == EthosUCompileSpec.get_output_format():
+ return EthosUCompileSpec.from_list(compile_specs)
+ if output_format == VgfCompileSpec.get_output_format():
+ return VgfCompileSpec.from_list(compile_specs)
+ raise ValueError(f"Unknown output format {output_format}")
+
+
+def create_partitioner(
+ compile_spec: ArmCompileSpec,
+ additional_checks: list[OperatorSupportBase] | None = None,
+):
+ if isinstance(compile_spec, TosaCompileSpec):
+ return TOSAPartitioner(compile_spec, additional_checks)
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return EthosUPartitioner(compile_spec, additional_checks)
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return VgfPartitioner(compile_spec, additional_checks)
+ else:
+ raise ValueError("compile spec doesn't target any Arm Partitioner")
+
+
+def create_quantizer(compile_spec: ArmCompileSpec):
+ if isinstance(compile_spec, TosaCompileSpec):
+ return TOSAQuantizer(compile_spec)
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return EthosUQuantizer(compile_spec)
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return VgfQuantizer(compile_spec)
+ else:
+ raise ValueError("compile spec doesn't target any Arm Quantizer")
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 0f3526975ff..f3de38c20da 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -18,23 +18,18 @@
import torch
from examples.devtools.scripts.export_bundled_program import save_bundled_program
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
- EthosUQuantizer,
- get_symmetric_quantization_config,
- TOSAQuantizer,
- VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
from executorch.backends.arm.util.arm_model_evaluator import (
evaluate_model,
evaluator_calibration_data,
)
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec
# To use Cortex-M backend
from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import (
@@ -158,15 +153,8 @@ def quantize(
export"""
logging.info("Quantizing Model...")
logging.debug(f"Original model: {model}")
- quantizer = None
- if isinstance(compile_specs, EthosUCompileSpec):
- quantizer = EthosUQuantizer(compile_specs)
- elif isinstance(compile_specs, TosaCompileSpec):
- quantizer = TOSAQuantizer(compile_specs)
- elif isinstance(compile_specs, VgfCompileSpec):
- quantizer = VgfQuantizer(compile_specs)
- else:
- raise RuntimeError("Unsupported compilespecs for quantization!")
+
+ quantizer = create_quantizer(compile_specs)
operator_config = get_symmetric_quantization_config()
quantizer.set_global(operator_config)
@@ -649,14 +637,7 @@ def to_edge_TOSA_delegate(
args, model, example_inputs, compile_spec
)
- if isinstance(compile_spec, EthosUCompileSpec):
- partitioner = EthosUPartitioner(compile_spec)
- elif isinstance(compile_spec, TosaCompileSpec):
- partitioner = TOSAPartitioner(compile_spec)
- elif isinstance(compile_spec, VgfCompileSpec):
- partitioner = VgfPartitioner(compile_spec)
- else:
- raise RuntimeError(f"Unhandled compile spec: {compile_spec}")
+ partitioner = create_partitioner(compile_spec)
edge = to_edge_transform_and_lower(
exported_program,
From 0cd8256d145ef7a7913d953347228f0dac4b1ee9 Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:57:16 +0200
Subject: [PATCH 091/266] Arm backend: Add docstrings for
operator_support/convolution_support.py (#14684)
Signed-off-by: Sebastian Larsson
---
.../operator_support/convolution_support.py | 47 +++++++++++++++----
1 file changed, 38 insertions(+), 9 deletions(-)
diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py
index 6e9d3b3528e..f335c5046f5 100644
--- a/backends/arm/operator_support/convolution_support.py
+++ b/backends/arm/operator_support/convolution_support.py
@@ -2,6 +2,12 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+"""Declare operator support for ``aten.convolution`` in TOSA.
+
+Provide general checks and hardware-specific constraints (e.g., U55 subset) for
+convolution nodes prior to delegation to the TOSA backend.
+
+"""
from typing import cast
@@ -18,6 +24,8 @@
@register_tosa_support_check
class ConvolutionSupported(SupportedTOSAOperatorCheck):
+ """Provide TOSA support check for convolutions."""
+
targets = [exir_ops.edge.aten.convolution.default]
tosa_specs = [
@@ -25,8 +33,15 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck):
TosaSpecification.create_from_string("TOSA-1.0+FP"),
]
- def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+ def is_node_tosa_supported(
+ self, node: fx.Node, tosa_spec: TosaSpecification
+ ) -> bool:
+ """Return True if the node is supported by TOSA.
+ Reject transposed convolutions and convolutions with non-zero output
+ padding. Apply additional hardware-specific constraints for U55.
+
+ """
# Not implemented
transposed = cast(bool, node.args[6])
output_padding = cast(list[int], node.args[7])
@@ -46,9 +61,19 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
else:
return True
- def _is_node_supported_u55(self, node: fx.Node):
- """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)"""
+ def _is_node_supported_u55(self, node: fx.Node) -> bool:
+ """Enforce Ethos-U55-specific constraints (Vela 4.2.0).
+
+ Check channel dimensions, kernel sizes, and stride/pad/dilation
+ combinations permitted on U55.
+ Args:
+ node (fx.Node): Convolution node to validate.
+
+ Returns:
+ bool: True if supported; otherwise, False.
+
+ """
shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape
shape_out = node.meta["val"].shape
kernel = cast(fx.Node, node.args[1]).meta["val"].shape
@@ -98,13 +123,17 @@ def _is_node_supported_u55(self, node: fx.Node):
return True
def _stride_condition(self, node: fx.Node) -> bool:
- """This condition is somewhat complex but boils down
- to not supporting stride > 3, unless we have some special conditions.
- This condition is a simplified, relaxed version of the hardware constraint,
- since the actual constraint requires information not available
- here (without a lot of work).
+ """Check a simplified stride/padding/dilation constraint.
+
+ Disallow strides greater than 3 unless there is no padding and the
+ dilation is 1. For 3D convolutions, enforce ``stride_z <= 1``.
+
+ Args:
+ node (fx.Node): Convolution node to evaluate.
+
+ Returns:
+ bool: True if the condition is satisfied.
- This means that we might accept ops that are not actually supported.
"""
strides = cast(list[int], node.args[3])
has_padding = any(pad > 0 for pad in cast(list[int], node.args[4]))
From 96dfa9c516ee76c8dbda8eeb7104f5f8c8c19a5f Mon Sep 17 00:00:00 2001
From: lucylq
Date: Wed, 1 Oct 2025 15:30:27 -0700
Subject: [PATCH 092/266] Add pybindings for bpte and ptd file
Differential Revision: D83518944
Pull Request resolved: https://github.com/pytorch/executorch/pull/14678
---
extension/pybindings/pybindings.cpp | 127 +++++++++++++------
extension/pybindings/test/test_pybindings.py | 19 ++-
2 files changed, 103 insertions(+), 43 deletions(-)
diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
index a896a4bde36..c3cd4ed0b47 100644
--- a/extension/pybindings/pybindings.cpp
+++ b/extension/pybindings/pybindings.cpp
@@ -158,6 +158,24 @@ void setup_output_storage(
}
}
+inline std::unique_ptr loader_from_buffer(
+ const void* ptr,
+ size_t ptr_len) {
+ return std::make_unique(ptr, ptr_len);
+}
+
+inline std::unique_ptr loader_from_file(const std::string& path) {
+ Result res = MmapDataLoader::from(
+ path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
+ THROW_IF_ERROR(
+ res.error(),
+ "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
+ path.c_str(),
+ static_cast(res.error()));
+
+ return std::make_unique(std::move(res.get()));
+}
+
inline std::unique_ptr load_module_from_buffer(
const void* ptr,
size_t ptr_len,
@@ -166,11 +184,11 @@ inline std::unique_ptr load_module_from_buffer(
std::unique_ptr event_tracer,
Program::Verification program_verification) {
EXECUTORCH_SCOPE_PROF("load_module_from_buffer");
- auto loader = std::make_unique(ptr, ptr_len);
+ auto loader = loader_from_buffer(ptr, ptr_len);
if (data_map_ptr.has_value() && data_map_len.has_value()) {
- auto data_map_loader = std::make_unique(
- data_map_ptr.value(), data_map_len.value());
+ auto data_map_loader =
+ loader_from_buffer(data_map_ptr.value(), data_map_len.value());
return std::make_unique(
std::move(loader),
nullptr, // memory_allocator
@@ -194,27 +212,9 @@ inline std::unique_ptr load_module_from_file(
Program::Verification program_verification) {
EXECUTORCH_SCOPE_PROF("load_module_from_file");
- Result program_loader_res = MmapDataLoader::from(
- program_path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- program_loader_res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- program_path.c_str(),
- static_cast(program_loader_res.error()));
- auto program_loader =
- std::make_unique(std::move(program_loader_res.get()));
-
+ auto program_loader = loader_from_file(program_path);
if (data_map_path.has_value()) {
- Result data_map_loader_res = MmapDataLoader::from(
- data_map_path->c_str(),
- MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- data_map_loader_res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- data_map_path->c_str(),
- static_cast(data_map_loader_res.error()));
- auto data_map_loader =
- std::make_unique(std::move(data_map_loader_res.get()));
+ auto data_map_loader = loader_from_file(data_map_path.value());
return std::make_unique(
std::move(program_loader),
nullptr, // memory_allocator
@@ -230,6 +230,22 @@ inline std::unique_ptr load_module_from_file(
nullptr); // data_map_loader
}
+inline std::unique_ptr load_module_from_buffer_with_data_file(
+ const void* ptr,
+ size_t ptr_len,
+ const std::string& data_map_path,
+ std::unique_ptr event_tracer,
+ Program::Verification program_verification) {
+ auto program_loader = loader_from_buffer(ptr, ptr_len);
+ auto data_loader = loader_from_file(data_map_path);
+ return std::make_unique(
+ std::move(program_loader),
+ nullptr, // memory_allocator
+ nullptr, // temp_allocator
+ std::move(event_tracer), // event_tracer
+ std::move(data_loader));
+}
+
inline py::list get_outputs_as_py_list(
const std::vector& outputs,
bool clone_outputs = true) {
@@ -555,6 +571,22 @@ struct PyModule final {
setup_event_tracer(enable_etdump, debug_buffer_size),
program_verification)) {}
+ explicit PyModule(
+ const void* ptr,
+ size_t ptr_len,
+ const std::string& data_path,
+ bool enable_etdump,
+ size_t debug_buffer_size = 0,
+ Program::Verification program_verification =
+ Program::Verification::InternalConsistency)
+ : debug_buffer_size_(debug_buffer_size),
+ module_(load_module_from_buffer_with_data_file(
+ ptr,
+ ptr_len,
+ data_path,
+ setup_event_tracer(enable_etdump, debug_buffer_size),
+ program_verification)) {}
+
explicit PyModule(
const std::string& program_path,
std::optional& data_path,
@@ -605,6 +637,7 @@ struct PyModule final {
program_verification);
}
+ // Load with data as a buffer.
static std::unique_ptr load_from_bundled_program(
PyBundledModule& m,
std::optional data_map_buffer,
@@ -628,6 +661,21 @@ struct PyModule final {
Program::Verification::InternalConsistency);
}
+ // Load with data as a file.
+ static std::unique_ptr load_from_bundled_program(
+ PyBundledModule& m,
+ const std::string& data_path,
+ bool enable_etdump,
+ size_t debug_buffer_size = 0) {
+ return std::make_unique(
+ m.get_program_ptr(),
+ m.get_program_len(),
+ data_path,
+ enable_etdump,
+ debug_buffer_size,
+ Program::Verification::InternalConsistency);
+ }
+
py::list run_method(
const std::string& method_name,
const py::sequence& inputs,
@@ -900,24 +948,6 @@ struct PyModule final {
}
};
-inline std::unique_ptr loader_from_buffer(
- const void* ptr,
- size_t ptr_len) {
- return std::make_unique(ptr, ptr_len);
-}
-
-inline std::unique_ptr loader_from_file(const std::string& path) {
- Result res = MmapDataLoader::from(
- path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- path.c_str(),
- static_cast(res.error()));
-
- return std::make_unique(std::move(res.get()));
-}
-
inline std::shared_ptr load_program(
std::unique_ptr loader,
Program::Verification program_verification) {
@@ -1474,12 +1504,25 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
call_guard);
m.def(
"_load_for_executorch_from_bundled_program",
- &PyModule::load_from_bundled_program,
+ py::overload_cast<
+ PyBundledModule&,
+ std::optional,
+ bool,
+ size_t>(&PyModule::load_from_bundled_program),
py::arg("ptr"),
py::arg("data_map_buffer") = std::nullopt,
py::arg("enable_etdump") = false,
py::arg("debug_buffer_size") = 0,
call_guard);
+ m.def(
+ "_load_for_executorch_from_bundled_program",
+ py::overload_cast(
+ &PyModule::load_from_bundled_program),
+ py::arg("ptr"),
+ py::arg("data_path"),
+ py::arg("enable_etdump") = false,
+ py::arg("debug_buffer_size") = 0,
+ call_guard);
m.def(
"_load_bundled_program_from_buffer",
&PyBundledModule::load_from_buffer,
diff --git a/extension/pybindings/test/test_pybindings.py b/extension/pybindings/test/test_pybindings.py
index 02ad6b5e327..ec45428c7d7 100644
--- a/extension/pybindings/test/test_pybindings.py
+++ b/extension/pybindings/test/test_pybindings.py
@@ -701,7 +701,7 @@ def test_program_data_separation(self) -> None:
bundled_buffer = serialize_from_bundled_program_to_flatbuffer(bundled_program)
bundled_module = self.runtime._load_bundled_program_from_buffer(bundled_buffer)
- # Load module from bundled program with external data
+ # Load module from bundled program with external data buffer
executorch_module_bundled = (
self.runtime._load_for_executorch_from_bundled_program(
bundled_module, data_buffer
@@ -710,6 +710,23 @@ def test_program_data_separation(self) -> None:
executorch_output_bundled = executorch_module_bundled.forward(inputs)[0]
self.assertTrue(torch.allclose(expected, executorch_output_bundled))
+ # Load module from bundled program with external data file
+ with tempfile.TemporaryDirectory() as tmpdir:
+ ptd_file = os.path.join(tmpdir, "linear.ptd")
+ with open(ptd_file, "wb") as ptd:
+ ptd.write(data_buffer)
+ executorch_module_bundled_data_file = (
+ self.runtime._load_for_executorch_from_bundled_program(
+ bundled_module, ptd_file
+ )
+ )
+ executorch_output_bundled_data_file = (
+ executorch_module_bundled_data_file.forward(inputs)[0]
+ )
+ self.assertTrue(
+ torch.allclose(expected, executorch_output_bundled_data_file)
+ )
+
# Test 6: Bundled program without external data should fail
executorch_module_bundled_no_data = (
self.runtime._load_for_executorch_from_bundled_program(bundled_module)
From b1309e71a2d91353dae5f8579500de5b47cdd03d Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 1 Oct 2025 23:52:46 +0100
Subject: [PATCH 093/266] Aoti support multi method (#14715)
This pull request introduces several improvements to the CUDA backend.
The main changes include adding a new graph pass to replace unnecessary
`slice_copy` operations, improving how method names are tracked in
compilation artifacts, and making the preprocessing pipeline more robust
and accurate.
**Key changes:**
### Graph optimization and preprocessing
* Introduced `ReplaceSliceCopyWithSlicePass`, a new export pass that
replaces non-mutated `slice_copy` operations with more efficient `slice`
view operations in the computational graph
(`replace_slice_copy_with_slice.py`, used in `cuda_backend.py`).
[[1]](diffhunk://#diff-c4a228b182f50f778545991d472609ad705d2325994342174093ff374738851dR1-R113)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aR115-R117)
* Added context management for attention kernel selection and no-grad
mode during AOT compilation to ensure correct backend selection for
decomposition. This is needed in the short term until we have a flash
attention cuda kernel.
### Method name and compile specification handling
* Added a `COMPILE_SPEC_KEYS` enum and utility methods
(`generate_method_name_compile_spec`, `method_name_from_compile_specs`)
to consistently embed and retrieve the method name in compile specs and
as a key in the data store, improving traceability of compiled
artifacts.
[[1]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL24-R35)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL161-R158)
[[3]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aR169-R195)
### Code cleanup and maintainability
* Minor refactor in `cuda_partitioner.py` to clarify delegation tag
assignment.
* Improved imports and code organization for clarity in
`cuda_backend.py`.
These changes collectively improve the reliability, performance, and
maintainability of the CUDA backend pipeline.
---
backends/cuda/cuda_backend.py | 50 +++++++-
backends/cuda/cuda_partitioner.py | 6 +-
.../cuda/replace_slice_copy_with_slice.py | 115 ++++++++++++++++++
3 files changed, 166 insertions(+), 5 deletions(-)
create mode 100644 backends/cuda/replace_slice_copy_with_slice.py
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 49314bed5e6..a39065f6a52 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -7,10 +7,14 @@
import contextlib
import os
import typing
+from enum import Enum
from typing import Any, Dict, final, List, Optional, Set
import torch
+from executorch.backends.cuda.replace_slice_copy_with_slice import (
+ ReplaceSliceCopyWithSlicePass,
+)
from executorch.exir._serialize._named_data_store import NamedDataStore
from executorch.exir._warnings import experimental
from executorch.exir.backend.backend_details import (
@@ -21,7 +25,7 @@
from executorch.exir.backend.compile_spec_schema import CompileSpec
from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
from torch.export.passes import move_to_device_pass
-
+from torch.nn.attention import SDPBackend
# exist fallback operators in et namespace;
supported_fallback_kernels: Dict[str, Any] = {}
@@ -30,6 +34,10 @@
missing_fallback_kernels: Set[str] = set()
+class COMPILE_SPEC_KEYS(Enum):
+ METHOD_NAME = "method_name"
+
+
# context manager for non-fallback guarantee
# it will raise exception when generating fallback kernels during aoti compile
@contextlib.contextmanager
@@ -108,6 +116,9 @@ def preprocess(
# Move the edge_program from CPU to CUDA for aoti compile
cuda_edge_program = move_to_device_pass(edge_program, "cuda")
+ # replace slice_copy with slice
+ ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module)
+
edge_program_module = cuda_edge_program.module()
# Grab all input placeholders from the graph
@@ -132,7 +143,10 @@ def preprocess(
"max_autotune_conv_backends": "TRITON",
}
- with collect_unsupported_fallback_kernels():
+ with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
+ [SDPBackend.MATH]
+ ), torch.no_grad():
+ # torch._logging.set_logs(post_grad_graphs=True)
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
if len(missing_fallback_kernels) > 0:
formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
@@ -146,7 +160,10 @@ def preprocess(
so_data = f.read()
named_data_store = NamedDataStore()
- named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+ method_name = CudaBackend.method_name_from_compile_specs(compile_specs)
+ named_data_store.add_named_data(
+ method_name + "_so_blob", so_data, 1, "aoti_cuda_blob"
+ )
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
# pyre-ignorep[6]: Incompatible parameter type
@@ -157,3 +174,30 @@ def preprocess(
debug_handle_map={},
data_store_output=named_data_store.get_named_data_store_output(),
)
+
+ @staticmethod
+ def generate_method_name_compile_spec(
+ method_name: str,
+ ) -> CompileSpec:
+        """
+        Returns the compile spec that embeds the given method name, so that it
+        can later be recovered via ``method_name_from_compile_specs``.
+        """
+ return CompileSpec(
+ COMPILE_SPEC_KEYS.METHOD_NAME.value,
+ method_name.encode("utf-8"),
+ )
+
+ @staticmethod
+ def method_name_from_compile_specs(
+ compile_specs: List[CompileSpec],
+ ) -> str:
+ """
+ Returns the method name from the compile specs.
+ """
+ for spec in compile_specs:
+ if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value:
+ return spec.value.decode("utf-8")
+ raise RuntimeError(
+ f"Could not find method name in compile specs: {compile_specs}"
+ )
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index d52d7d3d087..14c75bdb937 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -44,12 +44,14 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
"""
partition_tags: Dict[str, DelegationSpec] = {}
+ tag = "tag0"
+
for node in exported_program.graph.nodes:
if node.op != "call_function":
continue
- tag = "tag0"
node.meta["delegation_tag"] = tag
- partition_tags[tag] = self.delegation_spec
+
+ partition_tags[tag] = self.delegation_spec
tag_constant_data(exported_program)
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
new file mode 100644
index 00000000000..55ddef5de9b
--- /dev/null
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Iterable
+
+import torch
+from executorch.exir.dialects._ops import ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch import fx
+
+
+_SLICE_COPY_TARGETS = (
+ torch.ops.aten.slice_copy.Tensor,
+ ops.edge.aten.slice_copy.Tensor,
+)
+
+_SLICE_TARGETS = {
+ torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
+ ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
+}
+
+
+class ReplaceSliceCopyWithSlicePass(ExportPass):
+ """Replace non-mutated ``slice_copy`` results with ``slice`` views."""
+
+ def call(self, graph_module: fx.GraphModule) -> PassResult:
+ graph_changed = False
+
+ for node in graph_module.graph.nodes:
+ if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS:
+ continue
+
+ if self._has_blocking_user(node, node.users.keys()):
+ continue
+
+ node.target = _SLICE_TARGETS[node.target]
+ graph_changed = True
+
+ if graph_changed:
+ graph_module.graph.lint()
+ graph_module.recompile()
+
+ return PassResult(graph_module, graph_changed)
+
+ def _has_blocking_user(self, node: fx.Node, users: Iterable[fx.Node]) -> bool:
+ for user in users:
+ if self._is_mutating_user(node, user) or self._is_view_user(node, user):
+ return True
+ return False
+
+ def _is_mutating_user(self, node: fx.Node, user: fx.Node) -> bool:
+ if user.op == "call_method":
+ # Treat in-place tensor methods conservatively as mutations only when the
+ # method name ends with ``_`` which is the PyTorch convention for mutation.
+ return isinstance(user.target, str) and user.target.endswith("_")
+
+ if user.op != "call_function":
+ return False
+
+ target = user.target
+ if not hasattr(target, "_schema"):
+ return False
+
+ schema = target._schema # pyre-ignore[16]
+ # Positional arguments
+ for index, arg in enumerate(user.args):
+ if arg is node and self._argument_mutates(schema, index):
+ return True
+
+ # Keyword arguments
+ for name, arg in user.kwargs.items():
+ if arg is node and self._argument_mutates(schema, name):
+ return True
+
+ return False
+
+ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
+ if user.op == "call_method":
+ # Treat tensor methods conservatively and assume they may be view-producing.
+ return True
+
+ if user.op != "call_function":
+ return False
+
+ target = user.target
+ if getattr(target, "is_view", False):
+ for arg in user.args:
+ if arg is node:
+ return True
+ for arg in user.kwargs.values():
+ if arg is node:
+ return True
+
+ return False
+
+ def _argument_mutates(
+ self, schema: torch._C.FunctionSchema, key
+ ) -> bool: # pyre-ignore[11]
+ arguments = schema.arguments
+ if isinstance(key, int):
+ if key >= len(arguments):
+ return False
+ argument = arguments[key]
+ else:
+ argument = next((arg for arg in arguments if arg.name == key), None)
+ if argument is None:
+ return False
+
+ alias_info = argument.alias_info
+ return bool(alias_info and alias_info.is_write)
From 426b7015e2b8302791d37d249710cd8111c5b57b Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Thu, 2 Oct 2025 01:12:35 +0200
Subject: [PATCH 094/266] Arm backend: Backend test TOSA FP, INT and
Ethos-U55/U85 (#14653)
### Summary
Create arm_ethos_u55 and arm_ethos_u85 test flows and add them to CI
Build a semihosted runner for testing on the Corstone3x0 FVP
And split the arm_tosa test job that tested TOSA-1.0+FP into arm_tosa_fp
and arm_tosa_int to also test TOSA-1.0+INT
### Test plan
This will add new tests for arm_tosa_int arm_ethos_u55 and arm_ethos_u85
cc @digantdesai @freddan80 @per @oscarandersson8218
---------
Signed-off-by: Zingo Andersen
---
.ci/scripts/test_backend.sh | 7 +++
.github/workflows/test-backend-arm.yml | 2 +-
backends/test/suite/flow.py | 17 ++++++-
backends/test/suite/flows/arm.py | 68 ++++++++++++++++++++++----
4 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index df98fb43372..ba5df5c3fe3 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -1,6 +1,7 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -58,6 +59,12 @@ fi
if [[ "$FLOW" == *arm* ]]; then
# Setup ARM deps.
.ci/scripts/setup-arm-baremetal-tools.sh
+
+ if [[ "$FLOW" == *ethos_u* ]]; then
+ # Prepare a test runner binary that can run on the Corstone-3x0 FVPs
+ backends/arm/scripts/build_executorch.sh
+ backends/arm/test/setup_testing.sh
+ fi
fi
if [[ $IS_MACOS -eq 1 ]]; then
diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml
index bee74fee172..428e3fd1239 100644
--- a/.github/workflows/test-backend-arm.yml
+++ b/.github/workflows/test-backend-arm.yml
@@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_test_backend.yml
with:
backend: arm
- flows: '["arm_tosa"]'
+ flows: '["arm_tosa_fp", "arm_tosa_int", "arm_ethos_u55", "arm_ethos_u85"]'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 120
run-linux: true
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
index 05fc760683d..29394951bd7 100644
--- a/backends/test/suite/flow.py
+++ b/backends/test/suite/flow.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
import logging
from dataclasses import dataclass, field
@@ -122,10 +127,18 @@ def all_flows() -> dict[str, TestFlow]:
logger.info(f"Skipping QNN flow registration: {e}")
try:
- from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW
+ from executorch.backends.test.suite.flows.arm import (
+ ARM_ETHOS_U55_FLOW,
+ ARM_ETHOS_U85_FLOW,
+ ARM_TOSA_FP_FLOW,
+ ARM_TOSA_INT_FLOW,
+ )
flows += [
- ARM_TOSA_FLOW,
+ ARM_TOSA_FP_FLOW,
+ ARM_TOSA_INT_FLOW,
+ ARM_ETHOS_U55_FLOW,
+ ARM_ETHOS_U85_FLOW,
]
except Exception as e:
logger.info(f"Skipping ARM flow registration: {e}")
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
index baa2df79de9..34a6346fb1f 100644
--- a/backends/test/suite/flows/arm.py
+++ b/backends/test/suite/flows/arm.py
@@ -1,24 +1,72 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from executorch.backends.arm.quantizer import (
+ get_symmetric_quantization_config,
+ TOSAQuantizer,
+)
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.backends.test.suite.flow import TestFlow
+from executorch.backends.xnnpack.test.tester.tester import Quantize
-def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester:
- kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+def _create_tosa_flow(
+ name,
+ compile_spec,
+ quantize: bool = False,
+ symmetric_io_quantization: bool = False,
+ per_channel_quantization: bool = True,
+) -> TestFlow:
- return ArmTester(
- *args,
- **kwargs,
- )
+ def _create_arm_tester(*args, **kwargs) -> ArmTester:
+ kwargs["compile_spec"] = compile_spec
+ return ArmTester(
+ *args,
+ **kwargs,
+ )
+
+ # Create and configure quantizer to use in the flow
+ def create_quantize_stage() -> Quantize:
+ quantizer = TOSAQuantizer(compile_spec)
+ quantization_config = get_symmetric_quantization_config(
+ is_per_channel=per_channel_quantization
+ )
+ if symmetric_io_quantization:
+ quantizer.set_io(quantization_config)
+ return Quantize(quantizer, quantization_config)
-def _create_tosa_flow() -> TestFlow:
return TestFlow(
- "arm_tosa",
+ name,
backend="arm",
- tester_factory=_create_arm_tester_tosa_fp,
+ tester_factory=_create_arm_tester,
supports_serialize=False,
+ quantize=quantize,
+ quantize_stage_factory=create_quantize_stage if quantize else None,
)
-ARM_TOSA_FLOW = _create_tosa_flow()
+ARM_TOSA_FP_FLOW = _create_tosa_flow(
+ "arm_tosa_fp", common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+)
+ARM_TOSA_INT_FLOW = _create_tosa_flow(
+ "arm_tosa_int",
+ common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
+ quantize=True,
+)
+
+ARM_ETHOS_U55_FLOW = _create_tosa_flow(
+ "arm_ethos_u55",
+ common.get_u55_compile_spec(),
+ quantize=True,
+)
+
+ARM_ETHOS_U85_FLOW = _create_tosa_flow(
+ "arm_ethos_u85",
+ common.get_u85_compile_spec(),
+ quantize=True,
+)
From d4f208d2690bc9abae4709a8932d0ab596d81cc4 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Wed, 1 Oct 2025 17:11:37 -0700
Subject: [PATCH 095/266] Android set different maven package names of flavors
(#14674)
Different flavor name generates different maven packages
---
.github/workflows/android-release-artifacts.yml | 4 ++++
extension/android/executorch_android/build.gradle | 3 ++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml
index bec6d3a0f5e..beda0f77c83 100644
--- a/.github/workflows/android-release-artifacts.yml
+++ b/.github/workflows/android-release-artifacts.yml
@@ -90,6 +90,10 @@ jobs:
fi
FLAVOR="${{ inputs.flavor }}"
+ if [ ! -z "$FLAVOR" ]; then
+ GRADLE_ARGS+=" -Dflavor=${FLAVOR}"
+ fi
+
if [[ "$FLAVOR" == "vulkan" || -z "$FLAVOR" ]]; then
curl -O https://sdk.lunarg.com/sdk/download/1.4.321.1/linux/vulkansdk-linux-x86_64-1.4.321.1.tar.xz
tar xf vulkansdk-linux-x86_64-1.4.321.1.tar.xz -C /tmp
diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle
index e36044e3da5..0c18d60721e 100644
--- a/extension/android/executorch_android/build.gradle
+++ b/extension/android/executorch_android/build.gradle
@@ -15,6 +15,7 @@ plugins {
def qnnVersion = System.properties['qnnVersion']
def execuTorchVersion = System.properties['execuTorchVersion']
+def flavor = System.properties['flavor']
android {
namespace = "org.pytorch.executorch"
@@ -69,7 +70,7 @@ mavenPublishing {
publishToMavenCentral()
signAllPublications()
- coordinates("org.pytorch", "executorch-android" + (qnnVersion ? "-qnn" : ""), execuTorchVersion ? execuTorchVersion : "0.7.0-SNAPSHOT")
+ coordinates("org.pytorch", "executorch-android" + (flavor ? "-" + flavor : ""), execuTorchVersion ? execuTorchVersion : "1.0.0-SNAPSHOT")
pom {
name = "ExecuTorch Android"
From e608a21fc1845d960142b4c78ade06cdafdf5036 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Wed, 1 Oct 2025 19:45:58 -0600
Subject: [PATCH 096/266] [Backend Tester] Update README (#14739)
### Summary
Update the readme for the backend test suite to describe how to run with
pytest and to generally update for recent changes. Add CLI examples for
common invocation patterns (filter by test, flow, or backend) and add
some brief info on the JSON report format.
---
backends/test/suite/README.md | 80 +++++++++++++++++++++++++----------
1 file changed, 57 insertions(+), 23 deletions(-)
diff --git a/backends/test/suite/README.md b/backends/test/suite/README.md
index 564f44362ad..901cd461dbe 100644
--- a/backends/test/suite/README.md
+++ b/backends/test/suite/README.md
@@ -5,37 +5,71 @@ This directory contains tests that validate correctness and coverage of backends
These tests are intended to ensure that backends are robust and provide a smooth, "out-of-box" experience for users across the full span of input patterns. They are not intended to be a replacement for backend-specific tests, as they do not attempt to validate performance or that backends delegate operators that they expect to.
## Running Tests and Interpreting Output
-Tests can be run from the command line, either using the runner.py entry point or the standard Python unittest runner. When running through runner.py, the test runner will report test statistics, including the number of tests with each result type.
+Tests can be run from the command line using pytest. When generating a JSON test report, the runner will report detailed test statistics, including output accuracy, delegated nodes, lowering timing, and more.
-Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements.
+Each backend and test flow (recipe) registers a pytest [marker](https://docs.pytest.org/en/stable/example/markers.html) that can be passed to pytest with the `-m marker` argument to filter execution.
-Example:
+To run all XNNPACK backend operator tests:
```
-ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner
+pytest -c /dev/null backends/test/suite/operators/ -m backend_xnnpack -n auto
```
+To run all model tests for the CoreML static int8 lowering flow:
+```
+pytest -c /dev/null backends/test/suite/models/ -m flow_coreml_static_int8 -n auto
```
-2465 Passed / 2494
-16 Failed
-13 Skipped
-[Success]
-736 Delegated
-1729 Undelegated
+To run a specific test:
+```
+pytest -c /dev/null backends/test/suite/ -k "test_prelu_f32_custom_init[xnnpack]"
+```
-[Failure]
-5 Lowering Fail
-3 PTE Run Fail
-8 Output Mismatch Fail
+To generate a JSON report:
+```
+pytest -c /dev/null backends/test/suite/operators/ -n auto --json-report --json-report-file="test_report.json"
```
-Outcomes can be interpreted as follows:
- * Success (delegated): The test passed and at least one op was delegated by the backend.
- * Success (undelegated): The test passed with no ops delegated by the backend. This is a pass, as the partitioner works as intended.
- * Skipped: test fails in eager or export (indicative of a test or dynamo issue).
- * Lowering fail: The test fails in to_edge_transform_and_lower.
- * PTE run failure: The test errors out when loading or running the method.
- * Output mismatch failure: Output delta (vs eager) exceeds the configured tolerance.
+See [pytest-json-report](https://pypi.org/project/pytest-json-report/) for information on the report format. The test logic in this repository attaches additional metadata to each test entry under the `metadata`/`subtests` keys. One entry is created for each call to `test_runner.lower_and_run_model`.
+
+Here is an excerpt from a test run, showing a successful run of the `test_add_f32_bcast_first[xnnpack]` test.
+```json
+"tests": [
+ {
+ "nodeid": "operators/test_add.py::test_add_f32_bcast_first[xnnpack]",
+ "lineno": 38,
+ "outcome": "passed",
+ "keywords": [
+ "test_add_f32_bcast_first[xnnpack]",
+ "flow_xnnpack",
+ "backend_xnnpack",
+ ...
+ ],
+ "metadata": {
+ "subtests": [
+ {
+ "Test ID": "test_add_f32_bcast_first[xnnpack]",
+ "Test Case": "test_add_f32_bcast_first",
+ "Subtest": 0,
+ "Flow": "xnnpack",
+ "Result": "Pass",
+ "Result Detail": "",
+ "Error": "",
+ "Delegated": "True",
+ "Quantize Time (s)": null,
+ "Lower Time (s)": "2.881",
+ "Output 0 Error Max": "0.000",
+ "Output 0 Error MAE": "0.000",
+ "Output 0 SNR": "inf",
+ "Delegated Nodes": 1,
+ "Undelegated Nodes": 0,
+ "Delegated Ops": {
+ "aten::add.Tensor": 1
+ },
+ "PTE Size (Kb)": "1.600"
+ }
+ ]
+ }
+```
## Backend Registration
@@ -43,11 +77,11 @@ To plug into the test framework, each backend should provide an implementation o
At a minimum, the backend will likely need to provide a custom implementation of the Partition and ToEdgeTransformAndLower stages using the appropriate backend partitioner. See backends/xnnpack/test/tester/tester.py for an example implementation.
-Once a tester is available, the backend flow(s) can be added in __init__.py in this directory by adding an entry to `ALL_TESTER_FLOWS`. Each flow entry consists of a name (used in the test case naming) and a function to instantiate a tester for a given model and input tuple.
+Once a tester is available, the backend flow(s) can be added under flows/ and registered in flow.py. It is intended that this will be unified with the lowering recipes under executorch/export in the near future.
## Test Cases
-Operator test cases are defined under the operators/ directory. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow. The `@operator_test` decorator is applied to each test class to trigger this behavior. Tests can also be tagged with an appropriate type specifier, such as `@dtype_test`, to generate variants for each dtype. The decorators and "magic" live in __init__.py in this directory.
+Operator test cases are defined under the operators/ directory. Model tests are under models/. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow by use of the `test_runner` fixture parameter. Tests can additionally be parameterized using standard pytest decorators. Parameterizing over dtype is a common use case.
## Evolution of this Test Suite
From fb66fb38604dc02abfc3c52d97d5af72725c92b3 Mon Sep 17 00:00:00 2001
From: robert-kalmar
Date: Thu, 2 Oct 2025 04:12:16 +0200
Subject: [PATCH 097/266] NXP Backend: Add codeowner for the NXP Backend
(#14723)
Add codeowner for the NXP Backend.
cc @digantdesai @JakeStevens
---
CODEOWNERS | 2 ++
1 file changed, 2 insertions(+)
diff --git a/CODEOWNERS b/CODEOWNERS
index 10baed9ede4..11f3ca07615 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -14,6 +14,7 @@
/backends/transforms @kimishpatel
/backends/vulkan @SS-JIA
/backends/xnnpack @digantdesai @mcr229
+/backends/nxp @robert-kalmar
/devtools @Gasoonjia
@@ -33,6 +34,7 @@
/examples/qualcomm @cccclai
/examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka
/examples/xnnpack @digantdesai @mcr229
+/examples/nxp @robert-kalmar
/exir/backend @cccclai @kimishpatel @JacobSzwejbka
/exir @JacobSzwejbka @larryliu0820
From baaaa86ca9bb0a61c42cc36b781571bd5cac2cf6 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Wed, 1 Oct 2025 19:30:25 -0700
Subject: [PATCH 098/266] Add transposed convolution
Differential Revision: D83602808
Pull Request resolved: https://github.com/pytorch/executorch/pull/14708
---
backends/cadence/aot/ref_implementations.py | 59 ++++++++
.../aot/tests/test_ref_implementations.py | 137 ++++++++++++++++++
2 files changed, 196 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 312bed89315..ca15e825ff0 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -960,6 +960,7 @@ def convolution(
_stride: tuple[int, int] | int = stride
_padding: tuple[int, int] | int = padding
_dilation: tuple[int, int] | int = dilation
+
if conv_is_1d:
conv = torch.nn.functional.conv1d
_stride = stride[0]
@@ -978,6 +979,64 @@ def convolution(
return conv_out
+@impl(m, "transposed_convolution")
+def transposed_convolution(
+ input_tensor: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ stride: tuple[int, int],
+ padding: tuple[int, int],
+ dilation: tuple[int, int],
+ output_padding: tuple[int, int],
+ groups: int,
+ channel_last: bool = False,
+) -> torch.Tensor:
+
+ conv_is_1d = len(input_tensor.shape) == 3
+ if channel_last:
+ if conv_is_1d:
+ input_tensor = input_tensor.movedim(-1, 1).contiguous()
+ if len(weight.shape) != 3:
+ raise ValueError("Weight tensor must be 3D if input is 3D")
+ weight = weight.movedim(-1, 1).contiguous()
+ else:
+ input_tensor = input_tensor.movedim(-1, -3)
+ if len(weight.shape) != 4:
+ raise ValueError("Weight tensor must be 4D if input is nd > 3")
+ weight = torch.permute(weight, (0, -1, 1, 2)).contiguous()
+
+ _stride: tuple[int, int] | int = stride
+ _padding: tuple[int, int] | int = padding
+ _dilation: tuple[int, int] | int = dilation
+ _output_padding: tuple[int, int] | int = output_padding
+ if conv_is_1d:
+ conv = torch.nn.functional.conv_transpose1d
+ _stride = stride[0]
+ _padding = padding[0]
+ _dilation = dilation[0]
+ _output_padding = output_padding[0]
+ else:
+ conv = torch.nn.functional.conv_transpose2d
+
+ conv_out = conv(
+ input_tensor,
+ weight,
+ bias,
+ _stride,
+ _padding,
+ _output_padding,
+ groups,
+ _dilation,
+ )
+ if channel_last:
+ if conv_is_1d:
+ conv_out = conv_out.movedim(1, -1).contiguous()
+ else:
+ conv_out = conv_out.movedim(-3, -1).contiguous()
+
+ return conv_out
+
+
@impl(m, "avg_pool2d")
def avg_pool2d(
input_tensor: torch.Tensor,
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 32e9b43e68e..8d02c5c2963 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -1534,6 +1534,143 @@ def test_convolution(
f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
)
+ @expand(
+ [
+ # Basic 2D transposed convolution with stride=1 (current test case - corrected name)
+ (
+ "basic_2d_stride1",
+ torch.tensor(
+ [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+ ), # input: 1x1x2x2
+ torch.tensor(
+ [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32
+ ), # weight: 1x1x2x2
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ False, # channel_last
+ torch.tensor(
+ [[[[1.0, 3.0, 2.0], [4.0, 10.0, 6.0], [3.0, 7.0, 4.0]]]],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2D transposed convolution with channel_last=True (NHWC format)
+ (
+ "channel_last_nhwc",
+ torch.tensor(
+ [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32
+ ), # input: 1x2x2x1 (NHWC)
+ torch.tensor(
+ [[[[1.0], [1.0]], [[1.0], [1.0]]]], dtype=torch.float32
+ ), # weight: 1x2x2x1 (NHWC)
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ True, # channel_last=True
+ torch.tensor(
+ [
+ [
+ [[1.0], [3.0], [2.0]],
+ [[4.0], [10.0], [6.0]],
+ [[3.0], [7.0], [4.0]],
+ ]
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2D transposed convolution with non-zero bias
+ (
+ "with_bias",
+ torch.tensor(
+ [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+ ), # input: 1x1x2x2
+ torch.tensor(
+ [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32
+ ), # weight: 1x1x2x2
+ torch.tensor([5.0], dtype=torch.float32), # bias=5.0
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ False, # channel_last
+ torch.tensor(
+ [[[[6.0, 7.0, 5.0], [8.0, 10.0, 7.0], [5.0, 8.0, 9.0]]]],
+ dtype=torch.float32,
+ ),
+ ),
+ # 1D transposed convolution (3D tensor, NLC format)
+ (
+ "conv1d_nlc",
+ torch.tensor(
+ [[[1.0], [2.0], [3.0]]], dtype=torch.float32
+ ), # input: 1x3x1 (NLC)
+ torch.tensor(
+ [[[1.0], [0.5]]], dtype=torch.float32
+ ), # weight: 1x2x1 (NLC)
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (2, 0), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ True, # channel_last=True
+ torch.tensor(
+ [[[1.0], [0.5], [2.0], [1.0], [3.0], [1.5]]], dtype=torch.float32
+ ),
+ ),
+ ]
+ )
+ def test_transposed_convolution(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ stride: tuple[int, int],
+ padding: tuple[int, int],
+ dilation: tuple[int, int],
+ groups: int,
+ output_padding: tuple[int, int],
+ channel_last: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ output = torch.ops.cadence.transposed_convolution(
+ input_tensor,
+ weight,
+ bias,
+ stride,
+ padding,
+ dilation,
+ output_padding,
+ groups,
+ channel_last,
+ )
+
+ # Verify output properties
+ self.assertEqual(
+ output.dtype,
+ input_tensor.dtype,
+ f"Output dtype should match input dtype in {name}",
+ )
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"Output shape should match expected shape in {name}",
+ )
+
+ # Verify output matches expected values
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
+ )
+
@expand(
[
# Basic non-quantized average pooling
From 9ab5592a6533e9d903d927ff70d9aef83a74f0c6 Mon Sep 17 00:00:00 2001
From: cccclai
Date: Wed, 1 Oct 2025 21:13:24 -0700
Subject: [PATCH 099/266] support qnn mean (dim=None) (#14675)
Summary: Address mean op lower failure. When dim is not specified, it
will take mean across all axes. For QNN, we need to get axes based on
input shape
Differential Revision: D83520776
---
backends/qualcomm/builders/op_mean_dim.py | 19 ++-
backends/qualcomm/tests/models.py | 25 ++--
backends/qualcomm/tests/test_qnn_delegate.py | 132 ++++++++++++++++---
3 files changed, 143 insertions(+), 33 deletions(-)
diff --git a/backends/qualcomm/builders/op_mean_dim.py b/backends/qualcomm/builders/op_mean_dim.py
index 630b1b0b8de..22cb47ee288 100644
--- a/backends/qualcomm/builders/op_mean_dim.py
+++ b/backends/qualcomm/builders/op_mean_dim.py
@@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-from typing import cast, Dict, List
+from typing import cast, Dict
import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -40,7 +40,22 @@ def define_node(
)
# mean dims and keep dims
- mean_dims = cast(List[int], node.args[1])
+ rank = len(input_node.meta["val"].shape)
+
+ if rank == 0:
+ raise RuntimeError(
+ "Mean doesn't support 0d input, please report a bug in https://github.com/pytorch/executorch/issues"
+ )
+
+ dim_arg = node.args[1]
+
+ if dim_arg is None or len(dim_arg) == 0:
+ mean_dims = list(range(rank)) # reduce over all dims
+ elif isinstance(dim_arg, int):
+ mean_dims = [dim_arg]
+ else:
+ mean_dims = list(dim_arg)
+ print("mean_dims: ", mean_dims, "rank: ", rank)
mean_dims = [
mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims
]
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index a37648cb6be..cf4b2f21aaa 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -4,8 +4,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-import torch
+from typing import List, Optional, Tuple, Union
+import torch
# module with related operator only
@@ -1332,20 +1333,20 @@ def forward(self, x):
return self.max_pool2d(x)
-class MeanWKeppDim(torch.nn.Module):
- def __init__(self):
- super().__init__()
-
- def forward(self, x):
- return torch.mean(x, (-1, -2), keepdim=True)
-
-
-class MeanWOKeppDim(torch.nn.Module):
- def __init__(self):
+class Mean(torch.nn.Module):
+ def __init__(
+ self,
+ dim: Optional[Union[int, Tuple[int, ...], List[int]]] = None,
+ keepdim: bool = False,
+ dtype: Optional[torch.dtype] = None,
+ ):
super().__init__()
+ self.dim = dim
+ self.keepdim = keepdim
+ self.dtype = dtype
def forward(self, x):
- return torch.mean(x, (-1, -2))
+ return torch.mean(x, dim=self.dim, keepdim=self.keepdim, dtype=self.dtype)
class MaskedFill(torch.nn.Module):
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 6c444c90c08..e3cf52b9a6f 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -1018,12 +1018,61 @@ def test_qnn_backend_max_pool2d(self):
sample_input = (torch.randn(4, 3, 24, 24),)
self.lower_module_and_test_output(module, sample_input)
- def test_qnn_backend_mean_dim(self):
- modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405
- sample_input = (torch.randn([2, 5, 1, 3]),)
- for i, module in enumerate(modules):
+ def test_qnn_backend_mean(self):
+ test_comb = [
+ # Reduce over last two dims, keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Reduce over last two dims, keepdim=False
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Default: reduce all dims
+ {
+ QCOM_MODULE: Mean(), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+ },
+ # TODO: To be enabled via reshape input to 1d tensor
+ # # Scalar case
+ # {
+ # QCOM_MODULE: Mean(),
+ # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+ # },
+ # Edge case: dim is an empty list
+ {
+ QCOM_MODULE: Mean(dim=[]), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 (batch dimension)
+ {
+ QCOM_MODULE: Mean(dim=0), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 with keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along multiple dims
+ {
+ QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+ },
+ # Edge case: high-dimensional tensor
+ {
+ QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+ },
+ ]
+
+ for i, test in enumerate(test_comb):
with self.subTest(i=i):
- self.lower_module_and_test_output(module, sample_input)
+ self.lower_module_and_test_output(
+ test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+ )
@unittest.skip("failed to lower in QNN 2.26")
def test_qnn_backend_mha(self):
@@ -1216,10 +1265,8 @@ def test_qnn_backend_slice_scatter(self):
],
QCOM_SAMPLE_INPUTS: [
(
- (
- torch.zeros(8, 8),
- torch.ones(8, 2),
- )
+ torch.zeros(8, 8),
+ torch.ones(8, 2),
)
],
},
@@ -2666,13 +2713,62 @@ def test_qnn_backend_max_pool2d(self):
module = self.get_qdq_module(module, sample_input)
self.lower_module_and_test_output(module, sample_input)
- def test_qnn_backend_mean_dim(self):
- modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405
- sample_input = (torch.randn([2, 5, 1, 3]),)
- for i, module in enumerate(modules):
+ def test_qnn_backend_mean(self):
+ test_comb = [
+ # Reduce over last two dims, keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Reduce over last two dims, keepdim=False
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Default: reduce all dims
+ {
+ QCOM_MODULE: Mean(), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+ },
+ # TODO: To be enabled via reshape input to 1d tensor
+ # Scalar case
+ # {
+ # QCOM_MODULE: Mean(),
+ # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+ # },
+ # Edge case: dim is an empty list
+ {
+ QCOM_MODULE: Mean(dim=[]), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 (batch dimension)
+ {
+ QCOM_MODULE: Mean(dim=0), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 with keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along multiple dims
+ {
+ QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+ },
+ # Edge case: high-dimensional tensor
+ {
+ QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+ },
+ ]
+
+ for i, test in enumerate(test_comb):
with self.subTest(i=i):
- module = self.get_qdq_module(module, sample_input)
- self.lower_module_and_test_output(module, sample_input)
+ module = self.get_qdq_module(
+ test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+ )
+ self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS])
def test_qnn_backend_mha(self):
module = MultiheadAttention() # noqa: F405
@@ -2897,10 +2993,8 @@ def test_qnn_backend_slice_scatter(self):
],
QCOM_SAMPLE_INPUTS: [
(
- (
- torch.zeros(8, 8),
- torch.ones(8, 2),
- )
+ torch.zeros(8, 8),
+ torch.ones(8, 2),
)
],
},
From f24351a365ef5929538473a6d8983f7d0f1ddb50 Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Wed, 1 Oct 2025 22:28:18 -0700
Subject: [PATCH 100/266] Update mul int16 test
Differential Revision: D83437473
Pull Request resolved: https://github.com/pytorch/executorch/pull/14646
---
backends/arm/operators/op_repeat.py | 2 +-
backends/arm/test/ops/test_mul.py | 6 ------
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py
index 5db7ce9347c..9ee4e9fedf8 100644
--- a/backends/arm/operators/op_repeat.py
+++ b/backends/arm/operators/op_repeat.py
@@ -44,7 +44,7 @@ def define_node(
validate_valid_dtype(
self.target,
[inputs[0], output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT8, ts.DType.INT32, ts.DType.INT16, ts.DType.FP32],
output.tosa_spec,
)
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index e3f2096e7da..2c7b040658a 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -338,9 +338,6 @@ def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone300
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
"""Test mul operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
@@ -365,9 +362,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1):
"""Test mul operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
From 499ce5038cf4589eb6761dceb5763acc736fbec1 Mon Sep 17 00:00:00 2001
From: Yufeng Shi
Date: Thu, 2 Oct 2025 13:57:33 +0100
Subject: [PATCH 101/266] Arm backend: Add VGF tests to StableDiffusion module
tests (#14655)
Also refactor the StableDiffusion module tests to use test_pipeline
instead of ArmTester directly.
Signed-off-by: Yufeng Shi
---
.../test_CLIPTextModelWithProjection.py | 146 ++++++++++++------
.../test_SD3Transformer2DModel.py | 138 +++++++++++------
.../stable_diffusion/test_T5EncoderModel.py | 140 +++++++++++------
.../test_vae_AutoencoderKL.py | 114 +++++++++-----
4 files changed, 359 insertions(+), 179 deletions(-)
diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
index 49266beee63..fad31b57537 100644
--- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
CLIP_text_encoder_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
from transformers import CLIPTextModelWithProjection
+input_t = Tuple[torch.Tensor]
+
-class TestCLIPTextModelWithProjection(unittest.TestCase):
+class TestCLIPTextModelWithProjection:
"""
Test class of CLIPTextModelWithProjection.
CLIPTextModelWithProjection is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -69,47 +75,93 @@ def prepare_model_and_inputs(self):
return text_encoder_model, text_encoder_model_inputs
- def test_CLIPTextModelWithProjection_tosa_FP(self):
- text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- text_encoder_model,
- example_inputs=text_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- transform_passes=[
- ConvertInt64ConstOpsToInt32Pass(),
- ConvertInt64OutputOpsToInt32Pass(),
- InsertInt32CastsAfterInt64PlaceholdersPass(),
- ],
- )
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=text_encoder_model_inputs,
- )
- )
-
- def test_CLIPTextModelWithProjection_tosa_INT(self):
- text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- text_encoder_model,
- example_inputs=text_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=text_encoder_model_inputs,
- atol=0.8,
- )
- )
+
+def test_CLIPTextModelWithProjection_tosa_FP():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_CLIPTextModelWithProjection_tosa_INT():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ atol=0.8,
+ )
+ pipeline.change_args(
+ "check_count.exir",
+ TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_FP():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+            atol=4,  # TODO: Investigate numerical issue: MAX Diff ~50%
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_INT():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ atol=0.8,
+ )
+ pipeline.change_args(
+ "check_count.exir",
+ TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index f9d814d044b..1267c5b8e4c 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from diffusers.models.transformers import SD3Transformer2DModel
@@ -13,10 +13,16 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
SD3Transformer2DModel_init_dict,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+input_t4 = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
-class TestSD3Transformer2DModel(unittest.TestCase):
+class TestSD3Transformer2DModel:
"""
Test class of AutoenSD3Transformer2DModelcoderKL.
SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium
@@ -93,48 +99,88 @@ def forward(self, *args, **kwargs):
return sd35_transformer2D_model, sd35_transformer2D_model_inputs
- def test_SD3Transformer2DModel_tosa_FP(self):
- sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
- self.prepare_model_and_inputs()
- )
- with torch.no_grad():
- (
- ArmTester(
- sd35_transformer2D_model,
- example_inputs=sd35_transformer2D_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- )
- .export()
- .to_edge_transform_and_lower()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=sd35_transformer2D_model_inputs,
- rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
- atol=4.0,
- )
- )
- def test_SD3Transformer2DModel_tosa_INT(self):
- sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
- self.prepare_model_and_inputs()
+def test_SD3Transformer2DModel_tosa_FP():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ atol=4.0,
)
- with torch.no_grad():
- (
- ArmTester(
- sd35_transformer2D_model,
- example_inputs=sd35_transformer2D_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=sd35_transformer2D_model_inputs,
- qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
- rtol=1.0,
- atol=4.0,
- )
- )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_SD3Transformer2DModel_tosa_INT():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ rtol=1.0,
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_FP():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_INT():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ rtol=1.0,
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
index 22a47042eb1..20b92e4a258 100644
--- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
T5_encoder_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
from transformers import T5EncoderModel
+input_t = Tuple[torch.Tensor]
+
-class TestT5EncoderModel(unittest.TestCase):
+class TestT5EncoderModel:
"""
Test class of T5EncoderModel.
T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -61,46 +67,88 @@ def prepare_model_and_inputs(self):
return t5_encoder_model, t5_encoder_model_inputs
- def test_T5EncoderModel_tosa_FP(self):
- t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- t5_encoder_model,
- example_inputs=t5_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- transform_passes=[
- ConvertInt64ConstOpsToInt32Pass(),
- ConvertInt64OutputOpsToInt32Pass(),
- InsertInt32CastsAfterInt64PlaceholdersPass(),
- ],
- )
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=t5_encoder_model_inputs,
- )
- )
-
- def test_T5EncoderModel_tosa_INT(self):
- t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- t5_encoder_model,
- example_inputs=t5_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=t5_encoder_model_inputs,
- )
- )
+
+def test_T5EncoderModel_tosa_FP():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_T5EncoderModel_tosa_INT():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_FP():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_INT():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
index ab0f4892fb8..a3c3a018131 100644
--- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
+++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from diffusers.models.autoencoders import AutoencoderKL
@@ -14,10 +14,16 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
AutoencoderKL_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+input_t = Tuple[torch.Tensor]
-class TestAutoencoderKL(unittest.TestCase):
+class TestAutoencoderKL:
"""
Test class of AutoencoderKL.
AutoencoderKL is the encoder/decoder used by Stable Diffusion 3.5 Medium
@@ -41,40 +47,68 @@ def forward(self, *args, **kwargs):
return auto_encoder_model, auto_encoder_model_inputs
- def test_AutoencoderKL_tosa_FP(self):
- auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- auto_encoder_model,
- example_inputs=auto_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- )
- .export()
- .to_edge_transform_and_lower()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=auto_encoder_model_inputs,
- )
- )
-
- def test_AutoencoderKL_tosa_INT(self):
- auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- auto_encoder_model,
- example_inputs=auto_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=auto_encoder_model_inputs,
- atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
- )
- )
+
+def test_AutoencoderKL_tosa_FP():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.run()
+
+
+def test_AutoencoderKL_tosa_INT():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_FP():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_INT():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+ )
+ pipeline.run()
From edf69278ea59dc681a72ee3697021e6af533bb97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0imon=20Str=C3=BD=C4=8Dek?=
Date: Thu, 2 Oct 2025 16:02:21 +0200
Subject: [PATCH 102/266] NXP backend: Improve Neutron targets handling
(#14718)
### Summary
Adds NeutronTargetSpec class containing metadata about the desired
target for better handling of Neutron target support.
### Test plan
This feature modifies handling of individual operators target support
and therefore should be covered by already existing unit tests.
cc @digantdesai @JakeStevens @robert-kalmar
Co-authored-by: Jiri Ocenasek
---
.../nxp/backend/edge_program_converter.py | 10 ++-
.../ir/converter/builder/model_builder.py | 70 +++++++++-------
.../backend/ir/converter/node_converter.py | 25 ++----
.../ops_converters/add_tensor_converter.py | 17 ++--
.../ops_converters/cat_converter.py | 81 ++++++++++---------
.../constant_pad_nd_converter.py | 22 ++---
.../ops_converters/convolution_converter.py | 67 +++++++--------
.../ops_converters/mean_dim_converter.py | 38 ++++-----
.../ops_converters/softmax_converter.py | 17 +---
.../prune_transpose_operators.py | 2 +-
.../nxp/backend/neutron_converter_manager.py | 45 +++++++----
backends/nxp/backend/neutron_target_spec.py | 64 +++++++++++++++
backends/nxp/neutron_partitioner.py | 44 +++++-----
backends/nxp/nxp_backend.py | 20 ++---
backends/nxp/tests/executors.py | 10 +--
backends/nxp/tests/test_neutron_backend.py | 2 +-
.../tests/test_neutron_converter_manager.py | 9 +--
17 files changed, 289 insertions(+), 254 deletions(-)
create mode 100644 backends/nxp/backend/neutron_target_spec.py
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index 192798c151e..febcd03913a 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -18,6 +18,7 @@
from torch.fx import Node
from torch.nn.parameter import Parameter
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.backend.node_format_inference import (
NodeFormat,
NodeFormatInference,
@@ -54,12 +55,14 @@ class EdgeProgramToIRConverter:
"""
_default_conversion_config = ConversionConfig()
+ _default_target_spec = NeutronTargetSpec("imxrt700", "SDK_25_09")
_default_delegation_options = CustomDelegationOptions()
def convert_program(
self,
edge_program: ExportedProgram,
- conversion_config=_default_conversion_config,
+ conversion_config: ConversionConfig = _default_conversion_config,
+ neutron_target_spec: NeutronTargetSpec = _default_target_spec,
custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
) -> (bytes, dict):
"""
@@ -67,6 +70,7 @@ def convert_program(
:param edge_program: Converter ExportedProgram.
:param conversion_config: ConversionConfig instance.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param custom_delegation_options: Custom user options which affect node delegation.
:return: TFLite flatbuffers as bytes.
"""
@@ -76,6 +80,7 @@ def convert_program(
cc = self.build_conversion_context(
parameters_mapping,
node_formats,
+ neutron_target_spec,
conversion_config,
custom_delegation_options,
)
@@ -173,11 +178,12 @@ def map_inputs_to_parameters(edge_program: ExportedProgram) -> dict[str, Paramet
def build_conversion_context(
parameters_mapping: dict,
node_formats: dict[Node, NodeFormat],
+ neutron_target_spec: NeutronTargetSpec,
conversion_config: ConversionConfig = _default_conversion_config,
custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
) -> ConversionContext:
tflite_builder = AtenModelBuilderDirector(
- 3, "TFLite from EdgeProgram", conversion_config
+ 3, "TFLite from EdgeProgram", neutron_target_spec, conversion_config
)
# Add "sentinel" buffer (defined in schema.fbs)
diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py
index 496fa752853..643a6231d15 100755
--- a/backends/nxp/backend/ir/converter/builder/model_builder.py
+++ b/backends/nxp/backend/ir/converter/builder/model_builder.py
@@ -48,6 +48,7 @@
FlexTranspose,
)
from executorch.backends.nxp.backend.ir.tflite_optimizer import optimizer
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
class ModelBuilder:
@@ -74,17 +75,21 @@ class ModelBuilder:
_zeros_tensor_map: Dict # Mapping 'string' shapes to 'tflT.Tensor' objects
- _default_conversion_config = ConversionConfig()
+ neutron_target_spec: NeutronTargetSpec
conversion_config: ConversionConfig
+ _default_conversion_config = ConversionConfig()
+
def __init__(
self,
model_version: int,
model_description: str,
+ neutron_target_spec: NeutronTargetSpec,
conversion_config: ConversionConfig = _default_conversion_config,
) -> None:
self._tfl_model = tflite_model.Model(model_version, model_description)
+ self.neutron_target_spec = neutron_target_spec
self.conversion_config = conversion_config
self.op_code_type_index_map = {}
@@ -471,31 +476,7 @@ def finish(self) -> tflite_model.Model:
return self._tfl_model
- def _assign_tensor_and_buffer_indices( # noqa C901
- self, allow_inputs_stripping: bool
- ):
- """Correctly initialize all references via indices in all tensors and buffers."""
-
- # Assign each buffer its index
- for i, buffer in enumerate(self.get_buffers().vector):
- buffer.tmp_index = i
-
- # Assign each tensor its index and its buffer index
- for i, tensor in enumerate(self.get_tensors().vector):
- if tensor.tmp_null_tensor:
- # Using -1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that
- # this tensor should not be used.
- # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
- tensor.tmp_index = -1
- else:
- tensor.tmp_index = i
-
- tensor.buffer = tensor.tmp_buffer.tmp_index
-
- # TODO Remove inputs and outputs that are not in the tensors collection
-
- # Assign 'Outputs' and 'Inputs' their tensor indices
- outputs = self.get_sub_graph().outputs
+ def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
for tensor in outputs.tmp_outputs:
try:
outputs.append(tensor.tmp_index)
@@ -505,7 +486,6 @@ def _assign_tensor_and_buffer_indices( # noqa C901
f"The tensor '{tensor.name}' is among the model outputs, but does NOT appear in the graph!",
)
- inputs = self.get_sub_graph().inputs
for tensor in inputs.tmp_inputs:
try:
inputs.append(tensor.tmp_index)
@@ -520,14 +500,46 @@ def _assign_tensor_and_buffer_indices( # noqa C901
f"The tensor '{tensor.name}' is among the model inputs, but does NOT appear in the graph!",
)
- # Assign each operator its inputs and outputs indices
- for operator in self.get_sub_graph().operators.vector:
+ def _assign_operators_io_tensor_indices(self, operators):
+ for operator in operators.vector:
for inputTensor in operator.tmp_inputs:
operator.inputs.append(inputTensor.tmp_index)
for outputTensor in operator.tmp_outputs:
operator.outputs.append(outputTensor.tmp_index)
+ def _assign_tensor_and_buffer_indices(self, allow_inputs_stripping: bool):
+ """Correctly initialize all references via indices in all tensors and buffers."""
+
+ # Assign each buffer its index
+ for i, buffer in enumerate(self.get_buffers().vector):
+ buffer.tmp_index = i
+
+ # Assign each tensor its index and its buffer index
+ for i, tensor in enumerate(self.get_tensors().vector):
+ if tensor.tmp_null_tensor:
+                # Using -1 as the index to the 'tensors' vector is a way of telling the TFLite inference engine that
+ # this tensor should not be used.
+ # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
+ tensor.tmp_index = -1
+ else:
+ tensor.tmp_index = i
+
+ tensor.buffer = tensor.tmp_buffer.tmp_index
+
+ # TODO Remove inputs and outputs that are not in the tensors collection
+
+ subgraph = self.get_sub_graph()
+
+ # Assign 'Outputs' and 'Inputs' their tensor indices
+ self._assign_io_tensor_indices(
+ inputs=subgraph.inputs,
+ outputs=subgraph.outputs,
+ allow_inputs_stripping=allow_inputs_stripping,
+ )
+ # Assign each operator its inputs and outputs indices
+ self._assign_operators_io_tensor_indices(operators=subgraph.operators)
+
def _build_operator_code(
self, op_type: BuiltinOperator, version, custom_code: str = None
):
diff --git a/backends/nxp/backend/ir/converter/node_converter.py b/backends/nxp/backend/ir/converter/node_converter.py
index c44a6e19955..36266486aac 100755
--- a/backends/nxp/backend/ir/converter/node_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converter.py
@@ -4,7 +4,6 @@
# LICENSE file in the root directory of this source tree.
from abc import ABC, abstractmethod
-from enum import Enum
import torch
@@ -16,6 +15,7 @@
AtenModelBuilderDirector,
)
from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.exir.dialects._ops import ops as exir_ops
from torch.fx import Node
from torch.fx.passes.infra.partitioner import Partition
@@ -42,17 +42,6 @@ def is_not_qdq_node(node: torch.fx.Node) -> bool:
return not (_is_quant_node(node) or _is_dequant_node(node))
-class Target(Enum):
- IGNORE = "ignore" # No target platform. Any target specific restrictions will be ignored.
-
- RT700 = "imxrt700"
- IMX95 = "imx95"
-
- @classmethod
- def values(cls) -> list[str]:
- return [elt.value for elt in cls]
-
-
class NodeConverter(ABC):
"""
Classes which implement conversion of torch.Node to TFLite should inherit from this class and overwrite the
@@ -94,7 +83,7 @@ def _is_supported_in_IR(
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
@@ -103,31 +92,31 @@ def _is_supported_on_target(
can be used by operators with no target specific requirements.
:param node: The node (edge operator) to check.
- :param target: Value of the `Target` enum representing the target platform to check for.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param parameters_mapping: Dictionary mapping tensor names to their static data (if they have it).
:param custom_delegation_options: Custom options which affect delegation.
"""
- return target == Target.RT700
+ return True
@classmethod
def is_supported(
cls,
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
"""Check if the given `node` is supported in the IR and on the given `target` platform.
:param node: torch.Node to check.
- :param target: Value of the `Target` enum representing the target platform to check for.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param parameters_mapping: Dict mapping tensor names to their data.
:param custom_delegation_options: Custom user options which affect node delegation.
"""
return cls._is_supported_in_IR(
node, parameters_mapping, custom_delegation_options
) and cls._is_supported_on_target(
- node, target, parameters_mapping, custom_delegation_options
+ node, neutron_target_spec, parameters_mapping, custom_delegation_options
)
@classmethod
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
index c74baa61f67..cd5aa2ead81 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
@@ -9,11 +9,11 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
add_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -22,20 +22,15 @@ class AddTensorConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- if node_uses_shape_broadcasting(node):
- # Shape broadcasting may require the addition of `Transpose` ops during conversion.
- return False
-
- return True
+ if node_uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
- case _:
- return False
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
index 4f7f00fe5ba..22ca258cd4f 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
@@ -13,11 +13,11 @@
_is_dequant_node,
_is_quant_node,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.concatenation_options import (
Concatenation,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -72,51 +72,52 @@ def _all_io_shares_quantization_parameters(node: Node) -> bool:
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
if custom_delegation_options.force_delegate_cat:
return True
- match target:
- case Target.RT700:
- dim = CatConverter._get_normalized_dim(node)
-
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
- if dim == 0:
- return False
-
- # Neutron requires the channels to be a multiple of `8`. The channels could either be the second or the
- # last dimension, depending on the formats of the node. The format, however, cannot be determined
- # during conversion, as it depends on what other nodes are delegated.
- input_channels = [
- # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
- # will still be the channels in the IR.
- _get_shape(input_)[1]
- for input_ in node.all_input_nodes
- ] + [
- # If the inputs/outputs are channels first, the last dimension will be the channels.
- _get_shape(input_)[-1]
- for input_ in node.all_input_nodes
- ]
- if any((input_channel % 8) != 0 for input_channel in input_channels):
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
- return False
-
- output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
- if any((out_c % 8) != 0 for out_c in output_channels):
- return False
-
- if len(node.all_input_nodes) < 2: # Not supported on Neutron
- # TODO Try to skip the operator if this case is realistic.
- return False
-
- return True
-
- case _:
- return False
+ dim = CatConverter._get_normalized_dim(node)
+
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
+ if dim == 0:
+ return False
+
+ # Neutron requires the channels to be a multiple of numMacs. The channels could either be the second or the
+ # last dimension, depending on the formats of the node. The format, however, cannot be determined
+ # during conversion, as it depends on what other nodes are delegated.
+ input_channels = [
+ # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
+ # will still be the channels in the IR.
+ _get_shape(input_)[1]
+ for input_ in node.all_input_nodes
+ ] + [
+ # If the inputs/outputs are channels first, the last dimension will be the channels.
+ _get_shape(input_)[-1]
+ for input_ in node.all_input_nodes
+ ]
+ if any(
+ (input_channel % neutron_target_spec.get_num_macs()) != 0
+ for input_channel in input_channels
+ ):
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
+ return False
+
+ output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
+ if any(
+ (out_c % neutron_target_spec.get_num_macs()) != 0
+ for out_c in output_channels
+ ):
+ return False
+
+ if len(node.all_input_nodes) < 2: # Not supported on Neutron
+ # TODO Try to skip the operator if this case is realistic.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
index f58df1a88d9..499541aa58c 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
@@ -17,7 +17,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.quantization_utils import (
quantize_int8,
@@ -27,6 +26,7 @@
pad_options,
pad_v2_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -35,22 +35,16 @@ class ConstantPadNDConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # TODO: Consider different tensor formats (dim-order)
- paddings = node.args[1]
- if len(paddings) > 4 and paddings[4:6] != [0, 0]:
- # Attempt to Pad channels dimension, which is not supported on Neutron.
- return False
-
- return True
-
- case _:
- return False
+ paddings = node.args[1]
+ if len(paddings) > 4 and paddings[4:6] != [0, 0]:
+ # Attempt to Pad channels dimension, which is not supported on Neutron.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
index 8955b4c8fd4..f32b5a65cac 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
@@ -25,7 +25,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.node_converters.shared import (
conv_utils,
@@ -45,6 +44,7 @@
depthwise_conv_2d_options,
reshape_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -53,45 +53,38 @@ class ConvolutionConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- activations = node.args[0]
- weights = node.args[1]
- groups = node.args[8]
-
- if activations.meta["val"].shape[0] != 1:
- # Only batch size 1 is supported on neutron.
- return False
-
- if groups == 1: # Regular convolution.
- pass
- elif conv_utils.group_conv_convertible_as_depthwise(
- node, groups
- ): # Depthwise convolution.
- # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
- # weights. In case the weights are dynamic, a Transpose operator would have to be added, which
- # is not supported on Neutron.
- if not node_is_effectively_static_tensor(
- weights, parameters_mapping
- ):
- return False
- elif conv_utils.group_conv_convertible_into_multiple_convolutions(
- node, groups
- ): # Separable conv. This should never be reached, as the node should have been decomposed into
- # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
- logging.warning("Group convolution was not decomposed.")
- return False
- else: # Unexpected case (should never happen).
- return False
-
- return True
-
- case _:
+ activations = node.args[0]
+ weights = node.args[1]
+ groups = node.args[8]
+
+ if activations.meta["val"].shape[0] != 1:
+ # Only batch size 1 is supported on neutron.
+ return False
+
+ if groups == 1: # Regular convolution.
+ pass
+ elif conv_utils.group_conv_convertible_as_depthwise(
+ node, groups
+ ): # Depthwise convolution.
+ # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
+ # weights. In case the weights are dynamic, a Transpose operator would have to be added, which
+ # is not supported on Neutron.
+ if not node_is_effectively_static_tensor(weights, parameters_mapping):
return False
+ elif conv_utils.group_conv_convertible_into_multiple_convolutions(
+ node, groups
+ ): # Separable conv. This should never be reached, as the node should have been decomposed into
+ # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
+ logging.warning("Group convolution was not decomposed.")
+ return False
+ else: # Unexpected case (should never happen).
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -238,7 +231,7 @@ def _convert_1d_conv(
def _convert_unpadded_2D(
self, t_op: tflite_model.Operator, conv_params: ConvParameters
) -> conv_utils.ConvConversionResult:
- """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converter by the
+ """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converted by the
caller.
"""
common.assign_2d_strides(t_op.builtin_options, conv_params.stride)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
index f03c403876f..c1dd7b600be 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
@@ -12,7 +12,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import (
convert_axes_from_attribute,
@@ -20,6 +19,7 @@
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
mean_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -28,34 +28,20 @@ class MeanDimConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # TODO: Consider different tensor formats (dim-order)
- dim = node.args[1]
- keepdim = node.args[2] if len(node.args) >= 3 else False
- rank = len(node.args[0].meta["val"].shape)
- dim = [MeanDimConverter._to_neg_dim(d, rank) for d in dim]
-
- # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
- if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
- return False
-
- return True
-
- case _:
- return False
+ dim = node.args[1]
+ keepdim = node.args[2] if len(node.args) >= 3 else False
+ rank = len(node.args[0].meta["val"].shape)
+ dim = [d - rank if d > 0 else d for d in dim]
- @staticmethod
- def _to_pos_dim(d, rank):
- return d + rank if d < 0 else d
+ # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
+ if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
+ return False
- @staticmethod
- def _to_neg_dim(d, rank):
- return d - rank if d > 0 else d
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -75,6 +61,10 @@ def _is_supported_in_IR(
return True
+ @staticmethod
+ def _to_pos_dim(d: int, rank: int):
+ return d + rank if d < 0 else d
+
@staticmethod
def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]:
# convert negative index to positive
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
index aa74c78ca24..5e4404d8476 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
@@ -7,13 +7,11 @@
CustomDelegationOptions,
)
from executorch.backends.nxp.backend.edge_helper import input_rank
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
- NodeConverter,
- Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
softmax_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -22,18 +20,11 @@ class SoftmaxConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # The eIQ Neutron NPU runtime software has a known issue with the SoftMax operation.
- # As long as the issue is present, return False for the i.MX RT700 target also.
- return False
-
- case _:
- return False
+ return False
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
index dc9ad9999b4..0be46efcaa8 100755
--- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
+++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
index 2bc4380f89b..a6884a9ee24 100644
--- a/backends/nxp/backend/neutron_converter_manager.py
+++ b/backends/nxp/backend/neutron_converter_manager.py
@@ -7,8 +7,6 @@
import multiprocessing
import pkgutil
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
-
def convert_unsafe(neutron_converter, tflite_model, cctx, queue):
"""
@@ -27,16 +25,7 @@ class NeutronConverterManager:
contains NeutronGraph nodes.
"""
- _supported_target_names = [Target.RT700.value]
-
- def convert(
- self, tflite_model: bytes, target: str, neutron_converter_flavor: str
- ) -> bytes:
- # Neutron converter crashes if we provide invalid target -> verify.
- if target not in self._supported_target_names:
- raise RuntimeError(
- f"Target '{target}' is not supported by NeutronConverterManager."
- )
+ def __init__(self, neutron_converter_flavor: str = "SDK_25_09"):
neutron_converter_modules = [
module.name
@@ -57,13 +46,34 @@ def convert(
f"not found. Install 'neutron_converter_[flavor]' Python package."
)
- neutron_converter = importlib.import_module(
+ self.neutron_converter = importlib.import_module(
f"{requested_module_name}.neutron_converter"
)
+ self.neutron_library_utils = importlib.import_module(
+ f"{requested_module_name}.neutron_library_utils"
+ )
+
+ def get_converter(self):
+ return self.neutron_converter
+
+ def get_library_utils(self):
+ return self.neutron_library_utils
+
+ def verify_target(self, target: str):
+ if not self.neutron_library_utils.isNeutronTarget(target):
+ valid_targets = [
+ target.name for target in self.neutron_library_utils.getNeutronTargets()
+ ]
+ raise ValueError(
+ f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`."
+ )
+
+ def convert(self, tflite_model: bytes, target: str) -> bytes:
+ # Neutron converter crashes if we provide invalid target -> verify.
+ self.verify_target(target)
- cctx = neutron_converter.CompilationContext()
- cctx.targetOpts = neutron_converter.getNeutronTarget(target)
- # New switch since Neutron Converter SDK_25.06
+ cctx = self.neutron_converter.CompilationContext()
+ cctx.targetOpts = self.neutron_converter.getNeutronTarget(target)
cctx.compilationOpts.minNumOpsPerGraph = 1
logger = multiprocessing.log_to_stderr()
@@ -71,7 +81,8 @@ def convert(
queue = multiprocessing.Manager().Queue()
process = multiprocessing.Process(
- target=convert_unsafe, args=(neutron_converter, tflite_model, cctx, queue)
+ target=convert_unsafe,
+ args=(self.neutron_converter, tflite_model, cctx, queue),
)
process.start()
process.join() # waits until the subprocess is complete
diff --git a/backends/nxp/backend/neutron_target_spec.py b/backends/nxp/backend/neutron_target_spec.py
new file mode 100644
index 00000000000..44399982e29
--- /dev/null
+++ b/backends/nxp/backend/neutron_target_spec.py
@@ -0,0 +1,64 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Target Spec for the NXP Neutron NPU
+
+from enum import Enum
+
+from executorch.backends.nxp.backend.neutron_converter_manager import (
+ NeutronConverterManager,
+)
+
+
+class NeutronHWVersion(Enum):
+ N1 = 1
+ N3 = 2
+
+
+class NeutronTargetSpec:
+ """
+ The functionality for probing the properties of Neutron Target.
+ """
+
+ def __init__(self, target: str, neutron_converter_flavor: str):
+
+ converter_manager = NeutronConverterManager(neutron_converter_flavor)
+ converter_manager.verify_target(target)
+ neutron_converter = converter_manager.get_converter()
+ self.neutron_target = neutron_converter.getNeutronTarget(target)
+
+ if self.is_subsystem():
+ raise ValueError(
+ f"Target `{target}` is not a neutron-C target. Only MCU targets are supported at the moment."
+ )
+
+ if self.get_hw_version() != NeutronHWVersion.N3:
+ raise ValueError(
+ f"Target `{target}` contains unsupported HW version. Only N3/N3+ targets are supported at the moment."
+ )
+
+ # Target name.
+ def get_name(self) -> str:
+ return self.neutron_target.name
+
+ # Whether the target has subsystem (Neutron-S) or not (Neutron-C).
+ def is_subsystem(self) -> bool:
+ return self.neutron_target.subsystem
+
+ # Number of compute units.
+ def get_num_units(self) -> int:
+ return self.neutron_target.numUnits
+
+ # Number of compute pipelines.
+ def get_num_pipes(self) -> int:
+ return self.neutron_target.numPipes
+
+ # Number of compute MACs.
+ def get_num_macs(self) -> int:
+ return self.neutron_target.numMacs
+
+ # Neutron compute block hardware version.
+ def get_hw_version(self) -> NeutronHWVersion:
+ return NeutronHWVersion(self.neutron_target.hwVersion)
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 371b7474f58..917545e6c89 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -8,7 +8,7 @@
import logging
import operator
from dataclasses import dataclass
-from typing import Dict, final, List, Mapping
+from typing import final, Mapping
import torch
@@ -18,13 +18,13 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
from torch.export.exported_program import ExportedProgram
from torch.fx import Graph
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition
from torch.fx.passes.operator_support import OperatorSupportBase
from torch.nn import Parameter
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.nxp_backend import NeutronBackend
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.backend.partitioner import (
@@ -64,7 +64,7 @@ class QDQCluster:
"""
compute_node: torch.fx.Node
- ops: List[torch.fx.Node]
+ ops: list[torch.fx.Node]
QUANTIZE_OPERATORS = [
exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
@@ -97,7 +97,7 @@ def is_dequant_node(node: torch.fx.Node) -> bool:
def is_auxiliary_node(node: torch.fx.Node) -> bool:
return node.target in QDQClusterRecognizer.AUXILIARY_OPS
- def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Return the list of nodes representing the input part of the QDQ cluster of the node `node`.
Those are various dequantization nodes (see DEQUANTIZE_OPERATORS) optionally followed by auxiliary
@@ -125,7 +125,7 @@ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]
logging.debug(f"Dequant Cluster for {node} is: {qdq_cluster}")
return qdq_cluster
- def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Returns the list of nodes representing the output part of the QDQ cluster of the `node`.
Those are various quantize nodes (see QUANTIZE_OPERATORS) preceded by auxiliary nodes.
@@ -155,7 +155,7 @@ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node
logging.debug(f"Quant Cluster for {node} is {qdq_cluster}")
return qdq_cluster
- def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Returns the QDQ cluster of the operator, if quantized. If operator is not quantized, returns empty list.
"""
@@ -167,7 +167,7 @@ def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
else:
return []
- def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
+ def tag_nodes(self, nodes: list[torch.fx.Node], cluster_name: str) -> None:
"""
Tags a node and its related dequant and quant nodes with a specified cluster name
"""
@@ -175,7 +175,7 @@ def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
logging.info(f"Tagging node {node} as {cluster_name}")
node.meta["cluster"] = cluster_name
- def tag_qdq_clusters(self, nodes: List[torch.fx.Node]):
+ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
"""
Identifies QDQ clusters and tag them based on compute operation inside.
"""
@@ -220,14 +220,14 @@ class NeutronSupportedOperators(OperatorSupportBase):
def __init__(
self,
- qdq_clusters: Dict[str, QDQClusterRecognizer.QDQCluster],
- target: Target,
- operators_not_to_delegate: List[str],
+ qdq_clusters: dict[str, QDQClusterRecognizer.QDQCluster],
+ neutron_target_spec: NeutronTargetSpec,
+ operators_not_to_delegate: list[str],
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
):
self.qdq_clusters = qdq_clusters
- self.target = target
+ self.neutron_target_spec = neutron_target_spec
self.operators_not_to_delegate = operators_not_to_delegate
self.parameters_mapping = parameters_mapping
self.custom_delegation_options = custom_delegation_options
@@ -269,7 +269,7 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool:
# TODO: `view_copy` node should be delegated only if it's not the only operator in the cluster.
node_converter.is_supported(
node,
- self.target,
+ self.neutron_target_spec,
self.parameters_mapping,
self.custom_delegation_options,
)
@@ -305,13 +305,16 @@ def is_node_supported(
class NeutronPartitioner(Partitioner):
def __init__(
self,
- compile_spec: List[CompileSpec],
+ compile_spec: list[CompileSpec],
custom_delegation_options: CustomDelegationOptions | None = None,
) -> None:
self.delegation_spec = DelegationSpec(NeutronBackend.__name__, compile_spec)
self.custom_delegation_options = (
custom_delegation_options or CustomDelegationOptions()
)
+ target = self.delegation_spec[1][2].value.decode()
+ converter_flavor = self.delegation_spec[1][3].value.decode()
+ self.neutron_target_spec = NeutronTargetSpec(target, converter_flavor)
def validate_partitioning_result(
self,
@@ -343,22 +346,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
# subgraphs containing the nodes with the tags
logging.info("NeutronPartitioner::partition")
partition_tags = {}
+ partition_list = []
graph_module = exported_program.graph_module
nodes = list(graph_module.graph.nodes)
qdq_cluster_recognizer = QDQClusterRecognizer()
qdq_cluster_recognizer.tag_qdq_clusters(nodes)
+
graph_module.recompile()
- target = None
- operators_not_to_delegate = ""
- for spec in self.delegation_spec.compile_specs:
- if spec.key == "target":
- target = Target(spec.value.decode())
- if spec.key == "operators_not_to_delegate":
- operators_not_to_delegate = spec.value.decode().split(",")
- assert target is not None
+ operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",")
logging.info(f"Operators not to delegate: {operators_not_to_delegate}")
parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters(
@@ -368,7 +366,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
exported_program.graph_module,
NeutronSupportedOperators(
qdq_cluster_recognizer.cluster_map,
- target,
+ self.neutron_target_spec,
operators_not_to_delegate,
parameters_mapping,
self.custom_delegation_options,
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index fd1687d73fd..44e9a19d9f2 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -18,11 +18,11 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat
from executorch.backends.nxp.backend.neutron_converter_manager import (
NeutronConverterManager,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.neutron_node_extraction import (
extract_artifacts_from_neutron_node,
NeutronNodeArtifacts,
@@ -36,9 +36,9 @@
class NeutronCompileSpecBuilder:
+ config: NeutronTargetSpec
def __init__(self):
- self.config: Target = None
self.compile_spec: List[CompileSpec] = []
self.compiler_flags = []
self.output_format = None
@@ -68,14 +68,9 @@ def neutron_compile_spec(
extra_flags: Extra flags for the Neutron compiler
operators_not_to_delegate: List of operators that should not be delegated
"""
- try:
- self.config = Target(config)
- except ValueError:
- raise ValueError(
- f"Config `{config}` is not a valid target. Must be one of `{Target.values()}`."
- )
self.neutron_converter_flavor = neutron_converter_flavor
+ self.config = NeutronTargetSpec(config, neutron_converter_flavor)
assert (
self.output_format is None
@@ -101,7 +96,7 @@ def build(self):
self.compile_spec += [
CompileSpec("output_format", "tflite".encode()),
CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()),
- CompileSpec("target", self.config.value.encode()),
+ CompileSpec("target", self.config.get_name().encode()),
CompileSpec(
"neutron_converter_flavor", self.neutron_converter_flavor.encode()
),
@@ -187,10 +182,11 @@ def preprocess( # noqa C901
# Convert the edge program to TFLite.
tflite_model, io_formats = EdgeProgramToIRConverter().convert_program(
edge_program,
+ neutron_target_spec=NeutronTargetSpec(target, neutron_converter_flavor),
)
- neutron_model = NeutronConverterManager().convert(
- tflite_model, target, neutron_converter_flavor
+ neutron_model = NeutronConverterManager(neutron_converter_flavor).convert(
+ tflite_model, target
)
# Dump the tflite file if logging level is enabled
diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py
index 592717c0b3b..9626a2779c4 100644
--- a/backends/nxp/tests/executors.py
+++ b/backends/nxp/tests/executors.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 NXP
+# Copyright 2023-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -18,10 +18,8 @@
create_channels_first_to_channels_last_permutation,
create_channels_last_to_channels_first_permutation,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
- NodeConverter,
- Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.export import ExportedProgram
from torch.fx import Node
from torch.fx.graph import Graph
@@ -373,7 +371,7 @@ def graph_contains_any_of_ops(graph: Graph, ops: list) -> bool:
return any(node.target in ops for node in graph.nodes)
-target_support_check_function = Callable[[Node, Target], bool]
+target_support_check_function = Callable[[Node, NeutronTargetSpec], bool]
class OverrideTargetSupportCheck:
diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py
index 53e54ec2f56..c9917651fbd 100644
--- a/backends/nxp/tests/test_neutron_backend.py
+++ b/backends/nxp/tests/test_neutron_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py
index e10e8cca67b..2fcfd8cd987 100644
--- a/backends/nxp/tests/test_neutron_converter_manager.py
+++ b/backends/nxp/tests/test_neutron_converter_manager.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -29,9 +29,7 @@ def test_conv2d_neutron_conversion__default_flavor():
)
neutron_converter_manager = NeutronConverterManager()
- neutron_model = neutron_converter_manager.convert(
- tflite_model, "imxrt700", "SDK_25_09"
- )
+ neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700")
assert len(
neutron_model
@@ -50,9 +48,8 @@ def test__conv2d_neutron_conversion__invalid_flavor():
edge_program_manager.exported_program()
)
- neutron_converter_manager = NeutronConverterManager()
with pytest.raises(RuntimeError) as excinfo:
- _ = neutron_converter_manager.convert(tflite_model, "imxrt700", "bad_flavor")
+ _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700")
assert "Neutron Converter module with flavor 'bad_flavor' not found." in str(
excinfo
From 01456041ecaf58548da1d32397553edcb2713767 Mon Sep 17 00:00:00 2001
From: Agrima Khare <121654192+agrima1304@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:10:52 +0100
Subject: [PATCH 103/266] Arm Backend: Add tests for stack.default (#14623)
Stack is not in the list of core ATen ops and is decomposed
automatically when lowering the graph
(https://docs.pytorch.org/docs/main/export.html#export-ir-decompositions),
so only the tests need to be added.
stack is in this decomp table:
https://github.com/pytorch/pytorch/blob/5d749ceb92c2c28bcfbdf918b4ab99b1a91fcb50/torch/_decomp/__init__.py#L466
Signed-off-by: Agrima Khare
---
backends/arm/test/ops/test_stack.py | 150 ++++++++++++++++++++++++++++
1 file changed, 150 insertions(+)
create mode 100644 backends/arm/test/ops/test_stack.py
diff --git a/backends/arm/test/ops/test_stack.py b/backends/arm/test/ops/test_stack.py
new file mode 100644
index 00000000000..873a599992a
--- /dev/null
+++ b/backends/arm/test/ops/test_stack.py
@@ -0,0 +1,150 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineINT,
+ EthosU85PipelineINT,
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+test_data_suite = {
+ # (test_name, test_data)
+ "ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0),
+ "ones_and_rand_three_tensors": lambda: (
+ (torch.ones(1, 2), torch.randn(1, 2), torch.randn(1, 2)),
+ 1,
+ ),
+ "ones_and_rand_four_tensors": lambda: (
+ (
+ torch.ones(1, 2, 5),
+ torch.randn(1, 2, 5),
+ torch.randn(1, 2, 5),
+ torch.randn(1, 2, 5),
+ ),
+ -1,
+ ),
+ "rand_two_tensors": lambda: (
+ (torch.randn(2, 2, 4), torch.randn(2, 2, 4)),
+ 2,
+ ),
+ "rand_two_tensors_dim_0": lambda: (
+ (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)),
+ ),
+ "rand_two_tensors_dim_2": lambda: (
+ (torch.randn(2, 2, 3, 5), torch.randn(2, 2, 3, 5)),
+ 2,
+ ),
+ "rand_large": lambda: (
+ (
+ 10000 * torch.randn(2, 3, 1, 4),
+ torch.randn(2, 3, 1, 4),
+ torch.randn(2, 3, 1, 4),
+ ),
+ -3,
+ ),
+}
+
+
+class Stack(nn.Module):
+ aten_op = "torch.ops.aten.stack.default"
+ exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default"
+
+ def forward(self, n: tuple[torch.Tensor, ...], dim: int = 0):
+ return torch.stack(n, dim)
+
+
+input_t1 = Tuple[torch.Tensor]
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_FP(test_module: input_t1):
+ test_data = test_module()
+ pipeline = TosaPipelineFP[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = TosaPipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.XfailIfNoCorstone300
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u55_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = EthosU55PipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_ops=Stack.aten_op,
+ exir_ops=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.XfailIfNoCorstone320
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u85_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = EthosU85PipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_ops=Stack.aten_op,
+ exir_ops=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_FP(test_module: input_t1):
+ test_data = test_module()
+ pipeline = VgfPipeline[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = VgfPipeline[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
From 4372a143172df1b0037f296a36e9b5e83cdba548 Mon Sep 17 00:00:00 2001
From: Abhinayk
Date: Thu, 2 Oct 2025 10:03:06 -0700
Subject: [PATCH 104/266] Fix const prop pass when a const prop tensor has zero
stride, make it contiguous (#14725)
---
exir/passes/constant_prop_pass.py | 8 ++++
exir/tests/test_passes.py | 73 ++++++++++++++++++++++++++++++-
2 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
index 7daa3a247e8..06c1c78ee21 100644
--- a/exir/passes/constant_prop_pass.py
+++ b/exir/passes/constant_prop_pass.py
@@ -164,6 +164,14 @@ def get_propagated_const_tensor_dict(
with torch.no_grad():
# Execute the `node.target` and create a new propagated constant tensor.
prop_constant_tensor = node.target(*args_data, **kwargs_data)
+
+ # ExecuTorch doesn't support zero strides, so we need to ensure the tensor is contiguous
+ # if it has any zero strides from broadcasting/expansion operations
+ if (
+ isinstance(prop_constant_tensor, torch.Tensor)
+ and 0 in prop_constant_tensor.stride()
+ ):
+ prop_constant_tensor = prop_constant_tensor.contiguous()
const_node_to_tensor[node] = prop_constant_tensor
return const_node_to_tensor
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 716b808b087..14f105e8205 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -24,7 +24,17 @@
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
QuantizationConfig,
)
-from executorch.exir import EdgeCompileConfig, EdgeProgramManager, memory, to_edge
+from executorch.backends.xnnpack.utils.configs import (
+ get_xnnpack_executorch_backend_config,
+)
+
+from executorch.exir import (
+ EdgeCompileConfig,
+ EdgeProgramManager,
+ memory,
+ to_edge,
+ to_edge_transform_and_lower,
+)
from executorch.exir.dialects._ops import bind_pattern_to_op, ops, ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
from executorch.exir.emit import emit_program
@@ -2022,3 +2032,64 @@ def forward(self, x):
pass_result = constant_prop_pass(edge.exported_program())
# 1 constant: a (= self.w @ self.cst)
self.assertEqual(1, len(pass_result.constants))
+
+ def test_constant_prop_pass_zero_stride_tensors(self) -> None:
+ """
+ Test that constant propagation correctly handles tensors with zero strides
+ by converting them to contiguous tensors. Zero-stride tensors can be created
+ by operations like expand() and are not supported by ExecuTorch.
+ """
+
+ class ZeroStrideModel(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.const_param = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0]))
+
+ def forward(self, x):
+ unsqueezed = self.const_param.unsqueeze(
+ 1
+ ) # Shape: (3, 1), strides: (1, 1)
+ # expand creates zero-stride tensor
+ expanded = unsqueezed.expand(3, 5) # Shape: (3, 5), strides: (1, 0)
+
+ # Use the expanded tensor with the input to prevent elimination
+ result = x + expanded.sum()
+ return result
+
+ model = ZeroStrideModel()
+ x = torch.randn(3, 5)
+ exported = torch.export.export(model, (x,))
+
+ # Before constant prop: verify we have the parameter
+ self.assertIn("const_param", exported.state_dict)
+
+ const_prop_result = constant_prop_pass(exported)
+ lowered = to_edge_transform_and_lower(
+ const_prop_result,
+ partitioner=[XnnpackPartitioner()],
+ )
+
+ # Should go through
+ lowered.to_executorch(get_xnnpack_executorch_backend_config([SpecPropPass()]))
+ self.assertGreater(len(const_prop_result.constants), 0)
+
+ # Find the propagated constant tensor
+ prop_tensor = None
+ for constant_name, constant_tensor in const_prop_result.constants.items():
+ if constant_name.startswith("_prop_tensor_constant"):
+ prop_tensor = constant_tensor
+ break
+
+ # Verify the propagated tensor exists and has no zero strides
+ self.assertIsNotNone(prop_tensor)
+ self.assertNotIn(
+ 0,
+ prop_tensor.stride(),
+ f"Propagated tensor still has zero stride: {prop_tensor.stride()}",
+ )
+
+ # Verify the tensor is contiguous
+ self.assertTrue(
+ prop_tensor.is_contiguous(),
+ f"Propagated tensor is not contiguous: {prop_tensor.stride()}",
+ )
From 0882c9b689196791384a74ba1a2da695cd1cba4b Mon Sep 17 00:00:00 2001
From: DannyYuyang-quic
Date: Fri, 3 Oct 2025 01:30:47 +0800
Subject: [PATCH 105/266] Qualcomm AI Engine Direct - GA Static
Gemma-2b-instruct (#14459)
### Summary:
- e2e script for Gemma-2b-it in static llama version
- add model params file & model weight converter
### Test plan
``` bash
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma-2b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```
---
backends/qualcomm/tests/test_qnn_delegate.py | 59 ++++++++++
examples/models/gemma/__init__.py | 16 +++
examples/models/gemma/config/2b_config.json | 19 ++++
examples/models/gemma/convert_weights.py | 104 ++++++++++++++++++
examples/qualcomm/oss_scripts/llama/README.md | 20 +++-
.../qualcomm/oss_scripts/llama/__init__.py | 31 ++++++
.../oss_scripts/llama/decoder_constants.py | 1 +
examples/qualcomm/oss_scripts/llama/llama.py | 16 ++-
.../oss_scripts/llama/qnn_llama_runner.cpp | 3 +-
.../oss_scripts/llama/runner/runner.cpp | 6 +-
.../oss_scripts/llama/runner/runner.h | 1 +
11 files changed, 265 insertions(+), 11 deletions(-)
create mode 100644 examples/models/gemma/__init__.py
create mode 100644 examples/models/gemma/config/2b_config.json
create mode 100644 examples/models/gemma/convert_weights.py
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index e3cf52b9a6f..7018edcbb9c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4968,6 +4968,65 @@ def test_qnn_backend_seq_mse(self):
class TestExampleLLMScript(TestQNN):
+ def test_static_gemma_2b(self):
+ if not self.required_envs():
+ self.skipTest("missing required envs")
+
+ prompt = "My favourite condiment is "
+ cmds = [
+ "python",
+ f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+ "--artifact",
+ self.artifact_dir,
+ "--build_folder",
+ self.build_folder,
+ "--model",
+ self.model,
+ "--ip",
+ self.ip,
+ "--port",
+ str(self.port),
+ "--prompt",
+ f"{prompt}",
+ "--decoder_model",
+ "gemma-2b",
+ "--model_mode",
+ "kv",
+ "--max_seq_len",
+ "1024",
+ "--eval_perplexity",
+ "--tasks",
+ "wikitext",
+ "--limit",
+ "1",
+ ]
+ if self.compile_only:
+ cmds.extend(["--compile_only"])
+ elif self.device:
+ cmds.extend(["--device", self.device])
+ if self.host:
+ cmds.extend(["--host", self.host])
+ elif self.enable_x86_64:
+ cmds.extend(["--enable_x86_64"])
+ if self.pre_gen_pte:
+ cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+ p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+ with Listener((self.ip, self.port)) as listener:
+ conn = listener.accept()
+ p.communicate()
+ msg = json.loads(conn.recv())
+ if "Error" in msg:
+ self.fail(msg["Error"])
+ else:
+ inference_speed_ref = {"SM8650": 32, "SM8750": 36}
+ self.assertLessEqual(msg["wiki_ppl"], 35)
+ self.assertLessEqual(msg["pte_size"], 2_700_000_000) # 2.7GB
+ if self.model in inference_speed_ref:
+ self.assertGreaterEqual(
+ msg["inference_speed"], inference_speed_ref[self.model]
+ )
+
def test_static_gemma3_1b(self):
if not self.required_envs():
self.skipTest("missing required envs")
diff --git a/examples/models/gemma/__init__.py b/examples/models/gemma/__init__.py
new file mode 100644
index 00000000000..13a14ff0751
--- /dev/null
+++ b/examples/models/gemma/__init__.py
@@ -0,0 +1,16 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.gemma.convert_weights import convert_weights
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class GemmaModel(Llama2Model):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+
+__all__ = [
+ "GemmaModel",
+ "convert_weights",
+]
diff --git a/examples/models/gemma/config/2b_config.json b/examples/models/gemma/config/2b_config.json
new file mode 100644
index 00000000000..20a40723c30
--- /dev/null
+++ b/examples/models/gemma/config/2b_config.json
@@ -0,0 +1,19 @@
+{
+ "dim": 2048,
+ "ffn_dim_multiplier": 1,
+ "hidden_dim": 16384,
+ "n_heads": 8,
+ "head_dim": 256,
+ "n_kv_heads": 1,
+ "n_layers": 18,
+ "act_fn": "gelu",
+ "norm_type": "gemma3",
+ "norm_eps": 1e-06,
+ "rope_theta": 10000.0,
+ "use_scaled_rope": false,
+ "apply_embedding": true,
+ "embedding_scale_factor": 45.254833995939045,
+ "vocab_size": 256000,
+ "use_hf_rope": true,
+ "attention_qkv_bias": false
+}
diff --git a/examples/models/gemma/convert_weights.py b/examples/models/gemma/convert_weights.py
new file mode 100644
index 00000000000..09a17bc2266
--- /dev/null
+++ b/examples/models/gemma/convert_weights.py
@@ -0,0 +1,104 @@
+import argparse
+
+import json
+import os
+from typing import Dict
+
+import torch
+from safetensors.torch import load_file
+
+from torchtune.models.convert_weights import get_mapped_key
+
+
+# Weight mappings from Gemma's checkpoint to ExecuTorch's transformer parameters.
+_GEMMA_TO_EXECUTORCH = {
+ "model.embed_tokens.weight": "tok_embeddings.weight",
+ "model.norm.weight": "norm.weight",
+ "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+ "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+ "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+ "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+ "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+ "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+ "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
+ "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+ "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+}
+
+
+def gemma_to_executorch(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ """
+ Convert the state dict so that it matches what ExecuTorch's transformer definition expects.
+ """
+ converted_state_dict = {}
+ for key, value in state_dict.items():
+ new_key = get_mapped_key(key, _GEMMA_TO_EXECUTORCH)
+ converted_state_dict[new_key] = value
+ converted_state_dict["output.weight"] = converted_state_dict[
+ "tok_embeddings.weight"
+ ]
+ return converted_state_dict
+
+
+def load_checkpoint_from_safetensors(input_dir: str) -> Dict:
+ index_path = os.path.join(input_dir, "model.safetensors.index.json")
+ if os.path.exists(index_path):
+ # Sharded checkpoint.
+ with open(index_path, "r") as f:
+ index = json.load(f)
+ weight_map = index["weight_map"]
+ checkpoint_shards = sorted(set(weight_map.values()))
+
+ # Load all the shards into memory
+ shard_to_weights = {}
+ for shard in checkpoint_shards:
+ shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))
+
+ # Merge tensors into consolidated state dict.
+ merged_state_dict = {}
+ for weight_name, shard in weight_map.items():
+ tensor = shard_to_weights[shard][weight_name]
+ merged_state_dict[weight_name] = tensor
+ return merged_state_dict
+ else:
+ # Single checkpoint.
+ state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
+ return state_dict
+
+
+def load_checkpoint(input_dir: str) -> Dict:
+ pytorch_path = os.path.join(input_dir, "pytorch_model.bin")
+ if os.path.exists(pytorch_path):
+ print("Loading checkpoint from PyTorch .bin file")
+ return torch.load(pytorch_path, map_location="cpu", weights_only=True)
+ print("Loading checkpoint from safetensors directory")
+ return load_checkpoint_from_safetensors(input_dir)
+
+
+def convert_weights(input_dir: str, output_file: str) -> None:
+ print("Loading checkpoint...")
+ sd = load_checkpoint(input_dir)
+ print("Converting checkpoint...")
+ sd = gemma_to_executorch(sd)
+ print("Saving checkpoint...")
+ torch.save(sd, output_file)
+ print("Done.")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Convert Gemma weights to ExecuTorch transformer format."
+ )
+ parser.add_argument(
+ "input_dir",
+ type=str,
+ help="Path to directory containing safetensor checkpoint files, or PyTorch checkpoint file.",
+ )
+ parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+ args = parser.parse_args()
+ convert_weights(args.input_dir, args.output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 1be94ec04d6..9bb76142362 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -5,12 +5,13 @@ This file provides you the instructions to run LLM Decoder model with different
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
- 4. Gemma3 1B
- 5. Phi4-mini-instruct
- 6. QWEN2.5 0.5B / 1.5B
- 7. QWEN3 0.6B / 1.7B
- 8. SmolLM2 135M
- 9. SmolLM3 3B
+ 4. Gemma 2B
+ 5. Gemma3 1B
+ 6. Phi4-mini-instruct
+ 7. QWEN2.5 0.5B / 1.5B
+ 8. QWEN3 0.6B / 1.7B
+ 9. SmolLM2 135M
+ 10. SmolLM3 3B
We offer the following modes to execute the model:
@@ -78,6 +79,13 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```
+#### Gemma 2B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma-2b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
+```
+
+
#### Gemma3 1B
Default example using hybrid mode
```bash
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 5908fcf32a6..628defc1496 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -24,6 +24,7 @@
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
from executorch.examples.models.phi_4_mini import (
convert_weights as convert_phi_4_mini_weights,
@@ -300,6 +301,36 @@ class Llama3_2_3B_Instruct(LLMModelConfig):
)
+@register_llm_model("gemma-2b")
+@dataclass(init=False, frozen=True)
+class Gemma_2B(LLMModelConfig):
+ repo_id: str = "google/gemma-2b-it"
+ params_path: str = os.path.join(
+ BASE_DIR, "../../../models/gemma/config/2b_config.json"
+ )
+ convert_weights = convert_gemma_weights
+ transform_weight = False
+ instruct_model = True
+
+ num_sharding = 4
+ # quant config
+ ptq = QuantDtype.use_16a4w_block
+ group_size = 64
+ masked_softmax = True
+ seq_mse_candidates = 0
+ r1 = False
+ r2 = False
+ r3 = False
+ quantization_config_wv_sha_16a8w = get_ptq_per_channel_quant_config(
+ torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+ )
+ custom_annotation = (
+ annotate_kv_8bit,
+ annotate_output_16a8w,
+ partial(annotate_wv_sha, quantization_config=quantization_config_wv_sha_16a8w),
+ )
+
+
@register_llm_model("gemma3-1b")
@dataclass(init=False, frozen=True)
class Gemma3(LLMModelConfig):
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
index ac96770b889..d43ceb8351a 100644
--- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py
+++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -14,6 +14,7 @@
DECODER_MODEL_VERSION = {
"stories260k": "llama2",
"stories110m": "llama2",
+ "gemma-2b": "gemma",
"gemma3-1b": "gemma3",
"phi_4_mini": "phi_4_mini",
"llama3_2-1b_instruct": "llama3",
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index ae5ae63d509..887e680341f 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -327,6 +327,13 @@ def quantize(
chat_template, args.prompt[0], args.system_prompt
)
)
+
+ # Gemma may produce unexpected output if the prompt contains an extra token.
+ # This can happen after applying a prompt template, which might inject one unintentionally.
+ # To prevent decoding issues, we explicitly remove the extra token.
+ if chat_template and args.decoder_model in {"gemma-2b", "gemma3-1b"}:
+ prompt = prompt.replace("", "")
+
graph_module_inference(
use_kv_cache=self.llama_meta["get_use_kv_cache"],
get_example_inputs=self.get_example_inputs,
@@ -534,14 +541,13 @@ def compile(
state_dict = torch.load(
checkpoint, weights_only=True, map_location="cpu", mmap=True
)
- if args.decoder_model == "gemma3-1b":
+ if args.decoder_model in {"gemma-2b", "gemma3-1b"}:
for k, v in state_dict.items():
if "norm" not in k:
continue
# Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
# See https://github.com/huggingface/transformers/pull/29402
state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32)
-
else:
state_dict = torch.load(
args.checkpoint, weights_only=True, map_location="cpu", mmap=True
@@ -1286,7 +1292,11 @@ def export_llama(args) -> None:
)
tokenizer_artifacts = tokenizer.save_pretrained(args.artifact)
tokenizer_config = tokenizer_artifacts[0]
- runtime_tokenizer_path = tokenizer_artifacts[-1]
+ if args.decoder_model == "gemma-2b":
+ # For Gemma, use tokenizer.model as it doesn't provide pre_tokenizer in tokenizer.json.
+ runtime_tokenizer_path = tokenizer_artifacts[-3]
+ else:
+ runtime_tokenizer_path = tokenizer_artifacts[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config)
# TODO: Remove this once error is resolved.
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 71eaea2b8d6..2bffb35852a 100644
--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -9,7 +9,7 @@
/**
* @file
*
- * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma3 1B,
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma 2B, Gemma3 1B,
* phi4-mini-instruct, Qwen2.5 0.5B / 1.5B, Qwen3 0.6B / 1.7B, SmolLM2 135M,
* SmolLM3 3B with Qualcomm AI Engine Direct.
*
@@ -117,6 +117,7 @@ std::string get_formatted_prompt(
formatted_prompt.append(
"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
break;
+ case example::DecoderModelVersion::kGemma:
case example::DecoderModelVersion::kGemma3:
formatted_prompt.append("user\n");
formatted_prompt.append(prompt);
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index fe45d4b6a67..0c4884bbccf 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -122,6 +122,8 @@ Runner::Runner(
decoder_model_version_ = DecoderModelVersion::kLlama2;
} else if (decoder_model_version == "llama3") {
decoder_model_version_ = DecoderModelVersion::kLlama3;
+ } else if (decoder_model_version == "gemma") {
+ decoder_model_version_ = DecoderModelVersion::kGemma;
} else if (decoder_model_version == "gemma3") {
decoder_model_version_ = DecoderModelVersion::kGemma3;
cache_mode_ = CacheMode::HybridCache;
@@ -199,7 +201,9 @@ Error Runner::load() {
decoder_model_version_ == DecoderModelVersion::kSmollm2_135m ||
decoder_model_version_ == DecoderModelVersion::kSmollm3) {
eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]);
- } else if (decoder_model_version_ == DecoderModelVersion::kGemma3) {
+ } else if (
+ decoder_model_version_ == DecoderModelVersion::kGemma ||
+ decoder_model_version_ == DecoderModelVersion::kGemma3) {
eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]);
}
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index 9f290d79c75..1472093ab66 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -32,6 +32,7 @@ namespace example {
enum DecoderModelVersion {
kLlama2 = 0,
kLlama3,
+ kGemma,
kGemma3,
kPhi4,
kQwen2_5,
From deb42f2a8e48f5032b4a98ee781a15fa87a157cf Mon Sep 17 00:00:00 2001
From: Laith Sakka
Date: Thu, 2 Oct 2025 10:54:53 -0700
Subject: [PATCH 106/266] update llama export DS specs to be more accurate.
Differential Revision: D83708583
Pull Request resolved: https://github.com/pytorch/executorch/pull/14737
---
extension/llm/export/builder.py | 9 +++++++--
extension/llm/export/test/test_builder.py | 2 +-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 01000f3564c..da5c3324662 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -142,9 +142,14 @@ def __init__(
{1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
)
else:
- # Two input arguments: tokens and input_pos but input_pos is static shape
+ # Two input arguments: tokens and input_pos but input_pos is static shape.
+
+ # A runtime assertion added by torch.ops.llama.update_cache requires that
+ # L['tokens'].size()[1] + input_pos[0].item() < self.max_seq_len.
+ # This constrains L['tokens'].size()[1] to be at most self.max_seq_len - 1.
+ # Run with TORCH_LOGS=+dynamic for details.
self.dynamic_shapes = (
- {1: torch.export.Dim("token_dim", max=self.max_seq_len)},
+ {1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
{"input_pos": {0: 1}},
)
diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py
index 8bf591813ec..7883480c1e7 100644
--- a/extension/llm/export/test/test_builder.py
+++ b/extension/llm/export/test/test_builder.py
@@ -88,7 +88,7 @@ def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> Non
# Check first element (tokens dimension)
self.assertIsInstance(result[0], dict)
self.assertIn(1, result[0])
- self.assertEqual(result[0][1].max, self.max_seq_len)
+ self.assertEqual(result[0][1].max, self.max_seq_len - 1)
# Check second element (input_pos dimension)
self.assertIsInstance(result[1], dict)
From 19258d284c8257a53471a63d0b92f462f8eb2a5c Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Thu, 2 Oct 2025 12:47:25 -0700
Subject: [PATCH 107/266] update tokenizer pin (#14751)
Summary:
https://github.com/meta-pytorch/tokenizers/commit/65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
bringing in this change for windows builds of voxtral runner
Differential Revision: D83759380
---
extension/llm/tokenizers | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index b0076444dec..65e41a96e1b 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b0076444decffb88166452e26ba688233b905647
+Subproject commit 65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
From a1652f97b721dccc4f1f2585d3e1f15a2306e8d0 Mon Sep 17 00:00:00 2001
From: tmi
Date: Fri, 3 Oct 2025 00:16:59 +0200
Subject: [PATCH 108/266] Fix pyproject.toml license classifier deprecation
(#14592)
Gets rid of the 'deprecated' warnings that pop up multiple times during
build/install
Bumps setuptools requirement to accept the new license declaration
format
### Summary
Just a tiny PR, no change to API or code or anything. The license itself
is as before, it just changes the manner in which it is declared -- as
recommended by the PyPA guidelines
https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license
I tried to find any issue related to this but found none. And I guess
not worth it creating one
### Test plan
Since this does not change any of the code, I just tested that the
package can be installed/built as before via `./install_executorch.sh`,
and that the deprecation warnings vanish
---
pyproject.toml | 7 ++++---
requirements-dev.txt | 3 ++-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index fbed875a824..401b1fa2c24 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,10 @@
[build-system]
requires = [
"cmake>=3.29,<4.0.0", # For building binary targets in the wheel. 4.0.0 breaks third-party CMake build so temporarily pin the version.
+ "packaging>=24.2", # Lower bound required by setuptools
"pip>=23", # For building the pip package.
"pyyaml", # Imported by the kernel codegen tools.
- "setuptools>=63", # For building the pip package contents.
+ "setuptools>=77.0.3", # For building the pip package contents.
"wheel", # For building the pip package archive.
"zstd", # Imported by resolve_buck.py.
"certifi", # Imported by resolve_buck.py.
@@ -21,7 +22,8 @@ readme = "README-wheel.md"
authors = [
{name="PyTorch Team", email="packages@pytorch.org"},
]
-license = {file = "LICENSE"}
+license = "BSD-3-Clause"
+license-files = ["LICENSE"]
keywords = ["pytorch", "machine learning"]
# PyPI package information.
classifiers = [
@@ -33,7 +35,6 @@ classifiers = [
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
- "License :: OSI Approved :: BSD License",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Mathematics",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9df5e7b93ed..258a898894c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,9 +1,10 @@
# Pip packages needed to build from source. Mainly for development of ExecuTorch.
cmake>=3.29, <4.0.0 # For building binary targets in the wheel.
+packaging>=24.2 # Lower bound required by setuptools
pip>=23 # For building the pip package.
pyyaml # Imported by the kernel codegen tools.
-setuptools>=63 # For building the pip package contents.
+setuptools>=77.0.3 # For building the pip package contents.
wheel # For building the pip package archive.
zstd # Imported by resolve_buck.py.
certifi # Imported by resolve_buck.py.
From 53ccfd04c2ebd74da7d17174dd64711783466bcf Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Thu, 2 Oct 2025 23:25:43 +0100
Subject: [PATCH 109/266] Fix cuda export test failures from #14715 (#14753)
---
backends/cuda/TARGETS | 1 +
backends/cuda/cuda_backend.py | 4 +++-
backends/cuda/replace_slice_copy_with_slice.py | 13 ++++++++-----
backends/cuda/tests/test_cuda_export.py | 5 ++++-
4 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
index 3e412b6dc56..fe57f7f1b63 100644
--- a/backends/cuda/TARGETS
+++ b/backends/cuda/TARGETS
@@ -6,6 +6,7 @@ runtime.python_library(
name = "cuda_backend",
srcs = [
"cuda_backend.py",
+ "replace_slice_copy_with_slice.py",
],
visibility = [
"//executorch/...",
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index a39065f6a52..8ed8cdefbb1 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -144,7 +144,9 @@ def preprocess(
}
with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
- [SDPBackend.MATH]
+ [
+ SDPBackend.MATH # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
+ ]
), torch.no_grad():
# torch._logging.set_logs(post_grad_graphs=True)
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
index 55ddef5de9b..4f16759af35 100644
--- a/backends/cuda/replace_slice_copy_with_slice.py
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -6,20 +6,23 @@
# pyre-strict
-from typing import Iterable
+from typing import Dict, Iterable, Tuple
import torch
from executorch.exir.dialects._ops import ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
from executorch.exir.pass_base import ExportPass, PassResult
from torch import fx
-_SLICE_COPY_TARGETS = (
+_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
torch.ops.aten.slice_copy.Tensor,
ops.edge.aten.slice_copy.Tensor,
)
-_SLICE_TARGETS = {
+_SLICE_TARGETS: Dict[
+ torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
+] = {
torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
}
@@ -99,8 +102,8 @@ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
return False
def _argument_mutates(
- self, schema: torch._C.FunctionSchema, key
- ) -> bool: # pyre-ignore[11]
+ self, schema: torch._C.FunctionSchema, key: int | str
+ ) -> bool:
arguments = schema.arguments
if isinstance(key, int):
if key >= len(arguments):
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
index 99f8d33a766..d794a4f042c 100644
--- a/backends/cuda/tests/test_cuda_export.py
+++ b/backends/cuda/tests/test_cuda_export.py
@@ -8,6 +8,7 @@
from typing import Tuple
import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
from torch.export import export
@@ -30,7 +31,9 @@ def _export_to_cuda_with_lower(
exported_program = export(module, inputs, strict=True)
# Create partitioner and compile specs
- partitioner = CudaPartitioner([])
+ partitioner = CudaPartitioner(
+ [CudaBackend.generate_method_name_compile_spec("forward")]
+ )
# Use to_edge_transform_and_lower for complete pipeline
edge_program_manager = to_edge_transform_and_lower(
From c997fe405ac0ad6bf295ca5459f5352c2aeaae45 Mon Sep 17 00:00:00 2001
From: Naveen Suda <99509021+navsud@users.noreply.github.com>
Date: Thu, 2 Oct 2025 18:05:45 -0700
Subject: [PATCH 110/266] Remove explicit device arguments
Differential Revision: D82239525
Pull Request resolved: https://github.com/pytorch/executorch/pull/14619
---
examples/models/llama/model_args.py | 3 +++
examples/models/llama/rope.py | 10 ++++++++--
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py
index 04d29f91ac6..3f9d3d8f2af 100644
--- a/examples/models/llama/model_args.py
+++ b/examples/models/llama/model_args.py
@@ -63,6 +63,9 @@ class ModelArgs:
use_sdpa_with_kv_cache_op: bool = (
False # Use custom sdpa op that updates kv cache in-place
)
+ # Device to use for the model: "cpu" or "cuda" (needed for QAT)
+ # Only used for creating Rope parameters
+ device: str = "cpu"
# Generate logits for all inputs. When it's True, it would take big memory usage
# at runtime. Enable it only necessary (e.g., use perplexity tools that requires
# logits for all input tokens.)
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 8c0d5db6a80..0d1dd306091 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -138,7 +138,11 @@ def forward(
# and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
# Current only support non-long rope.
def hf_precompute_freqs_cis(
- dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+ dim: int,
+ end: int,
+ theta: float,
+ partial_rotary_factor: float = 1.0,
+ device: Union[str, torch.device] = "cpu",
):
# Partial rotary embeddings.
dim = int(dim * partial_rotary_factor)
@@ -146,7 +150,7 @@ def hf_precompute_freqs_cis(
# Short factor scaling.
freqs = 1.0 / (
theta
- ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
+ ** (torch.arange(0, dim, 2, device=device, dtype=torch.int64).float() / dim)
)
# TODO: support long factor scaling.
@@ -236,6 +240,7 @@ def __init__(self, params: ModelArgs):
self.precompute_freqs_cis = partial(
hf_precompute_freqs_cis,
partial_rotary_factor=self.params.partial_rotary_factor,
+ device=self.params.device,
)
self.apply_rotary_emb = hf_apply_rotary_emb
else:
@@ -244,6 +249,7 @@ def __init__(self, params: ModelArgs):
use_scaled=self.params.use_scaled_rope,
scale_factor=self.params.rope_scale_factor,
high_freq_factor=self.params.high_freq_factor,
+ device=self.params.device,
)
self.apply_rotary_emb = RotaryEmbedding()
From 54bfd72921034825f5bd0e5bfcd93808bc8156b1 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Thu, 2 Oct 2025 20:11:48 -0700
Subject: [PATCH 111/266] Fix Wav2Vec Replace Pass Bug
Differential Revision: D83778606
Pull Request resolved: https://github.com/pytorch/executorch/pull/14757
---
backends/cadence/aot/replace_ops.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 9e95460f2f5..2104764cd14 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -89,10 +89,10 @@ def replace_logical_nop_where_with_where(
# Get the third arg node and its input
logical_not_node = node.args[0]
- logical_not_input_tensor = logical_not_node.args[0].to_tensor()
+ logical_not_input_node = logical_not_node.args[0]
# If the logical_not input is not a boolean tensor, bail.
- if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
+ if logical_not_input_node.meta["val"].dtype != torch.bool:
continue
# Replace the where op with another one, flipping the inputs and using the boolean
From 822a711dbe3b12f8defe740ea6ab570dec2841f6 Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Thu, 2 Oct 2025 20:59:25 -0700
Subject: [PATCH 112/266] Update addmm int16 for Ethos-U85
Differential Revision: D83627934
Pull Request resolved: https://github.com/pytorch/executorch/pull/14714
---
backends/arm/operators/op_bmm.py | 23 +++++++++++++++++++++++
backends/arm/test/ops/test_addmm.py | 6 ------
2 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py
index 2636a08d7c5..9bebc3597ca 100644
--- a/backends/arm/operators/op_bmm.py
+++ b/backends/arm/operators/op_bmm.py
@@ -79,6 +79,12 @@ def define_node(
input1_zp = input_qparams[1].get_zp_per_tensor()
bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
bmm_output_name = bmm_result.name
+ elif inputs[0].dtype == ts.DType.INT16:
+ input_qparams = get_input_qparams(node)
+ input0_zp = input_qparams[0].get_zp_per_tensor()
+ input1_zp = input_qparams[1].get_zp_per_tensor()
+ bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT48)
+ bmm_output_name = bmm_result.name
else:
bmm_output_name = output.name
input0_zp, input1_zp = 0, 0
@@ -118,3 +124,20 @@ def define_node(
output_zp=[output_qparams.get_zp_per_tensor()],
rounding_mode=RoundingMode.SINGLE_ROUND,
)
+ elif output.dtype == ts.DType.INT16:
+ output_qparams = get_output_qparams(node)[0]
+ final_output_scale = (
+ input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore[61]
+ ) / output_qparams.get_scale_per_tensor()
+
+ build_rescale(
+ tosa_fb=tosa_graph,
+ scale=[final_output_scale],
+ # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
+ input_node=bmm_result, # type: ignore[possibly-undefined]
+ output_name=output.name,
+ output_type=ts.DType.INT16,
+ input_zp=[0],
+ output_zp=[output_qparams.get_zp_per_tensor()],
+ rounding_mode=RoundingMode.SINGLE_ROUND,
+ )
diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py
index b9a891ec740..1170f65dd58 100644
--- a/backends/arm/test/ops/test_addmm.py
+++ b/backends/arm/test/ops/test_addmm.py
@@ -213,9 +213,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False):
@common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
- reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979"
-)
def test_addmm_16a8w_tosa_INT(test_data: input_t1):
"""Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
@@ -268,9 +265,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations"
-)
def test_addmm_16a8w_u85_INT16(test_data: input_t1):
"""Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
From e6527463f88fd69862b6799d3e9465b9690d4309 Mon Sep 17 00:00:00 2001
From: Naveen Suda <99509021+navsud@users.noreply.github.com>
Date: Thu, 2 Oct 2025 22:07:58 -0700
Subject: [PATCH 113/266] Use FusedMovingAvgObsFakeQuantize instead of
FakeQuantize for faster QAT
Differential Revision: D83583655
Pull Request resolved: https://github.com/pytorch/executorch/pull/14740
---
backends/qualcomm/quantizer/qconfig.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 30af923781a..694fab3dc6b 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -200,7 +200,7 @@ def get_16a8w_qnn_qat_config(
act_observer=MovingAverageMinMaxObserver,
) -> QuantizationConfig:
extra_args: Dict[str, Any] = {"eps": 2**-20}
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32,
quant_min=torch.iinfo(torch.uint16).min,
quant_max=torch.iinfo(torch.uint16).max,
@@ -398,7 +398,7 @@ def get_ptq_per_block_quant_config(
def get_8a8w_qnn_qat_config(
act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
) -> QuantizationConfig:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.uint8,
qscheme=(
torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
@@ -458,7 +458,7 @@ def get_8a8w_qnn_qat_config(
def get_16a4w_qnn_qat_config(
act_observer=MovingAverageMinMaxObserver,
) -> QuantizationConfig:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32,
quant_min=torch.iinfo(torch.uint16).min,
quant_max=torch.iinfo(torch.uint16).max,
@@ -541,7 +541,7 @@ def get_qat_per_channel_quant_config(
# If zero_point is 128, htp can do optimizations.
# If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
# If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
qscheme=torch.per_tensor_symmetric,
observer=act_observer,
@@ -553,7 +553,7 @@ def get_qat_per_channel_quant_config(
observer_or_fake_quant_ctr=act_fake_quant_ctr,
)
else:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
quant_min=torch.iinfo(act_dtype).min,
quant_max=torch.iinfo(act_dtype).max,
From 70ea66186e34210676171b3fb1ac8055117d8c06 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Thu, 2 Oct 2025 23:00:56 -0700
Subject: [PATCH 114/266] Add Phi4 test and fix regex parsing.
Differential Revision: D83641294
Pull Request resolved: https://github.com/pytorch/executorch/pull/14716
---
.../Exported/ExecuTorchLLMTextRunner.h | 15 +++++++-
.../Exported/ExecuTorchLLMTextRunner.mm | 11 +++++-
.../__tests__/MultimodalRunnerTest.swift | 2 +-
.../__tests__/TextRunnerTest.swift | 37 ++++++++++++++++++-
4 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
index 550a20ea633..50957ee47f5 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
@@ -25,12 +25,23 @@ __attribute__((deprecated("This API is experimental.")))
@param modelPath File system path to the serialized model.
@param tokenizerPath File system path to the tokenizer data.
- @param tokens An array of NSString special tokens to use during tokenization.
+ @return An initialized ExecuTorchLLMTextRunner instance.
+*/
+- (instancetype)initWithModelPath:(NSString *)modelPath
+ tokenizerPath:(NSString *)tokenizerPath;
+
+/**
+ Initializes a text LLM runner with the given model and tokenizer paths,
+ and a list of special tokens to include in the tokenizer.
+
+ @param modelPath File system path to the serialized model.
+ @param tokenizerPath File system path to the tokenizer data.
+ @param specialTokens An array of NSString special tokens to use during tokenization.
@return An initialized ExecuTorchLLMTextRunner instance.
*/
- (instancetype)initWithModelPath:(NSString *)modelPath
tokenizerPath:(NSString *)tokenizerPath
- specialTokens:(NSArray *)tokens
+ specialTokens:(NSArray *)specialTokens
NS_DESIGNATED_INITIALIZER;
/**
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
index 4ea1bd921f7..1a6c3f40045 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
@@ -28,15 +28,22 @@ @implementation ExecuTorchLLMTextRunner {
std::unique_ptr _runner;
}
+- (instancetype)initWithModelPath:(NSString*)modelPath
+ tokenizerPath:(NSString*)tokenizerPath {
+ return [self initWithModelPath:modelPath
+ tokenizerPath:tokenizerPath
+ specialTokens:@[]];
+}
+
- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath
- specialTokens:(NSArray*)tokens {
+ specialTokens:(NSArray*)specialTokens {
self = [super init];
if (self) {
_modelPath = [modelPath copy];
_tokenizerPath = [tokenizerPath copy];
_specialTokens = std::make_unique>();
- for (NSString *token in tokens) {
+ for (NSString *token in specialTokens) {
_specialTokens->emplace_back(token.UTF8String);
}
}
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index cdf15f12350..7ae9da4969b 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -60,7 +60,7 @@ class MultimodalRunnerTest: XCTestCase {
let userPrompt = "What's on the picture?"
let sequenceLength = 768
- func test() {
+ func testLLaVA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index 5e99af0c57f..f7124fec640 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -39,7 +39,7 @@ class TextRunnerTest: XCTestCase {
let userPrompt = "The capital of France is called"
let sequenceLength = 128
- func test() {
+ func testLLaMA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
@@ -73,4 +73,39 @@ class TextRunnerTest: XCTestCase {
}
XCTAssertTrue(text.lowercased().contains("paris"))
}
+
+ func testPhi4() {
+ let bundle = Bundle(for: type(of: self))
+ guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
+ let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+ XCTFail("Couldn't find model or tokenizer files")
+ return
+ }
+ let runner = TextRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+ var text = ""
+
+ do {
+ try runner.generate(userPrompt, Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("paris"))
+
+ text = ""
+ runner.reset()
+ do {
+ try runner.generate(userPrompt, Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("paris"))
+ }
}
From 05799c93bac19db778f380bb906d0d556e1672ca Mon Sep 17 00:00:00 2001
From: Vaclav Novak
Date: Fri, 3 Oct 2025 10:33:40 +0200
Subject: [PATCH 115/266] NXP backend: added aten.sub operator support (#14514)
### Summary
Adds support for the aten.sub operator.
### Test plan
tests can be manually run using `pytest -c /dev/null
backends/nxp/tests/`
---------
Co-authored-by: Martin Pavella
---
.../nxp/backend/edge_program_converter.py | 1 +
.../ops_converters/__init__.py | 4 +
.../ops_converters/sub_tensor_converter.py | 59 ++++++
backends/nxp/neutron_partitioner.py | 1 +
backends/nxp/quantizer/neutron_quantizer.py | 2 +
backends/nxp/quantizer/patterns.py | 26 +++
.../test_add_tensor_converter.py | 4 +
.../test_sub_tensor_converter.py | 175 ++++++++++++++++++
backends/nxp/tests/models.py | 28 +++
9 files changed, 300 insertions(+)
create mode 100644 backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
create mode 100644 backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index febcd03913a..03d55548d2d 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -43,6 +43,7 @@
exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter, # noqa F405
exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405
exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405
+ exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405
exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405
exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405
exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
index 472a3495e19..3cf70f46b8d 100755
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -56,6 +56,9 @@
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.softmax_converter import (
SoftmaxConverter,
)
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.sub_tensor_converter import (
+ SubTensorConverter,
+)
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.tanh_converter import (
TanhConverter,
)
@@ -80,6 +83,7 @@
"MaxPool2dConverter",
"AvgPool2dConverter",
"AddTensorConverter",
+ "SubTensorConverter",
"CloneConverter",
"AbsConverter",
"AdaptiveAvgPool2dConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
new file mode 100644
index 00000000000..e9522c87114
--- /dev/null
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
@@ -0,0 +1,59 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.nxp.backend.ir.converter.conversion.common import (
+ node_uses_shape_broadcasting,
+)
+from executorch.backends.nxp.backend.ir.converter.node_converter import (
+ CustomDelegationOptions,
+ NodeConverter,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
+ sub_options,
+)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
+from torch.fx import Node
+from torch.nn import Parameter
+
+
+class SubTensorConverter(NodeConverter):
+ @staticmethod
+ def _is_supported_on_target(
+ node: Node,
+ neutron_target_spec: NeutronTargetSpec,
+ parameters_mapping: dict[str, Parameter],
+ custom_delegation_options: CustomDelegationOptions,
+ ) -> bool:
+ if node_uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
+
+ return True
+
+ @staticmethod
+ def _is_supported_in_IR(
+ node: Node,
+ parameters_mapping: dict[str, Parameter],
+ custom_delegation_options: CustomDelegationOptions,
+ ) -> bool:
+ if len(node.args) != 2:
+ return False
+
+ # The `alpha` attribute can be represented by adding an extra `Mul` operator.
+ # However, this is not implemented as `alpha` is rarely used.
+        if "alpha" in node.kwargs:
+ return False
+
+ return True
+
+ # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1)
+ def convert(self, node: Node):
+ """Convert 'sub_tensor' operator to NeutronIR 'Sub'."""
+ self.assert_convertible(node)
+
+ t_op = self._create_tflite_op_with_io_tensors(node)
+
+ t_op.builtin_options = sub_options.Sub()
+ self.builder.append_operators([t_op])
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 917545e6c89..e7ad7ff7a0b 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -210,6 +210,7 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
exir_ops.edge.aten.mm.default: MMConverter, # noqa F405
exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405
exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405
+ exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405
exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405
exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405
exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index db19bcb8ba8..2681e221869 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -36,6 +36,7 @@
SharedSpecPattern,
SigmoidPattern,
SoftMaxPattern,
+ SubTensorPattern,
TanhInPlacePattern,
TanhPattern,
ViewPattern,
@@ -208,6 +209,7 @@ def __init__(self):
NeutronAtenQuantizer(ReshapePattern(), static_qconfig),
NeutronAtenQuantizer(SigmoidPattern(), static_qconfig),
NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig),
+ NeutronAtenQuantizer(SubTensorPattern(), static_qconfig),
NeutronAtenQuantizer(TanhPattern(), static_qconfig),
NeutronAtenQuantizer(TanhInPlacePattern(), static_qconfig),
NeutronAtenQuantizer(ViewPattern(), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index 34ee611b8b2..9588ce24c9e 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -224,6 +224,32 @@ def get_anchors(
)
+class SubTensorPattern(QuantizationPattern):
+ """
+ Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes.
+
+ Basic quantization for all inputs and output.
+ """
+
+ def partition_types(self) -> list[torch.nn.Module]:
+ return [torch.ops.aten.sub.Tensor]
+
+ def get_anchors(
+ self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+ ) -> PartitionAnchors | None:
+ node = fused_partition[0].nodes[-1]
+ inputs = [(node, NodeArgsIdx(0))]
+ if len(fused_partition[0].input_nodes) == 2:
+ inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))]
+
+ return PartitionAnchors(
+ inputs=inputs,
+ weights=[],
+ biases=[],
+ output=[(node,)],
+ )
+
+
class AvgPoolPattern(SharedSpecPattern):
"""
Quantizer for AvgPool2D operator.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 567b593e05b..2c3107eae77 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -1,3 +1,7 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
import numpy as np
import pytest
import torch
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
new file mode 100644
index 00000000000..98566ff1ad6
--- /dev/null
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -0,0 +1,175 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import pytest
+import torch
+
+from executorch.backends.nxp.backend.edge_program_converter import (
+ EdgeProgramToIRConverter,
+)
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executors import (
+ convert_run_compare,
+ ToChannelFirstPreprocess,
+ ToChannelLastPreprocess,
+)
+from executorch.backends.nxp.tests.models import (
+ SubTensorConvModule,
+ SubTensorModule,
+ SubTensorOneInputModule,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
+
+
+@pytest.fixture(autouse=True)
+def reseed_model_per_test_run():
+ torch.manual_seed(23)
+ np.random.seed(23)
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ [
+ pytest.param((4,), id="1D."),
+ pytest.param((6, 6), id="2D."),
+ pytest.param((1, 4, 8), id="3D."),
+ pytest.param((1, 4, 8, 8), id="4D."),
+ ],
+)
+def test_sub_tensor_quant_conversion(mocker, input_shape):
+ model = SubTensorModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, [input_shape, input_shape])
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data_1 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data_2 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data = {0: input_data_1, 1: input_data_2}
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[4].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+ )
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ [
+ pytest.param((4,), id="1D."),
+ pytest.param((6, 6), id="2D."),
+ pytest.param((1, 4, 8), id="3D."),
+ pytest.param((1, 4, 8, 8), id="4D."),
+ ],
+)
+def test_sub_tensor_one_input_quant_conversion(mocker, input_shape):
+ model = SubTensorOneInputModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, input_shape)
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8)
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[2].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+ )
+
+
+@pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1, 4, 8, 8), id="4D."),
+ pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."),
+ ],
+)
+def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape):
+ model = SubTensorConvModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ n, c, h, w = x_input_shape
+ y_input_shape = (n, 8, h, w)
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, [x_input_shape, y_input_shape])
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data = {0: input_data_1, 1: input_data_2}
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[15].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program,
+ input_data=input_data,
+ tflite_input_preprocess=ToChannelLastPreprocess(),
+ tfl_model=tflite_flatbuffers_model,
+ tflite_output_preprocess=ToChannelFirstPreprocess(),
+ )
+
+
+@pytest.mark.parametrize(
+ "x_input_shape, y_input_shape",
+ [
+ pytest.param((1, 4, 7), (4, 7), id="3D -> 2D."),
+ pytest.param((1, 4, 8), (1, 4, 4, 8), id="3D -> 4D."),
+ pytest.param((1, 1, 4, 4, 8), (1, 4, 4, 8), id="5D -> 4D."),
+ pytest.param((4,), (4, 4), id="1D -> 2D."),
+ pytest.param((4,), (4, 4, 4), id="1D -> 3D."),
+ pytest.param((6, 6), (1, 8, 6, 6), id="2D -> 4D."),
+ pytest.param((6, 6), (6,), id="2D -> 1D."),
+ ],
+)
+def test_sub_tensor_broadcasting_unsupported_quant_conversion(
+ x_input_shape, y_input_shape
+):
+ model = SubTensorModule()
+
+ # Run conversion
+ edge_program = to_quantized_edge_program(
+ model, [x_input_shape, y_input_shape]
+ ).exported_program()
+ nodes = list(edge_program.graph.nodes)
+
+ # Broadcast is not supported, node is not converted
+ assert (
+ nodes[6].target == exir_ops.edge.aten.sub.Tensor
+ ) # Sub Tensor is not delegated.
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
index e7b60b2566c..f613349fed0 100644
--- a/backends/nxp/tests/models.py
+++ b/backends/nxp/tests/models.py
@@ -451,6 +451,34 @@ def forward(x):
return x + x
+class SubTensorModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ @staticmethod
+ def forward(x, y):
+ return x - y
+
+
+class SubTensorConvModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv = Conv2dModule(padding=1, stride=1)
+
+ def forward(self, x, y):
+ x = self.conv(x)
+ return x - y
+
+
+class SubTensorOneInputModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ @staticmethod
+ def forward(x):
+ return x - x
+
+
class MeanDimLinearModule(torch.nn.Module):
def __init__(self, dim, keepdim):
super().__init__()
From 3557edf1dfab4fcc9732bfae30f61001a4f96d7f Mon Sep 17 00:00:00 2001
From: neuropilot-captain
Date: Fri, 3 Oct 2025 21:15:50 +0800
Subject: [PATCH 116/266] Update MTK tool versions in documents (#14772)
### Summary
NeuroPilot Express SDK is released for ExecuTorch 1.0. Update the
document for the latest tool version
Resolves discussion 14253
---
backends/mediatek/README.md | 4 ++--
docs/source/backends-mediatek.md | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md
index e8a535b3fde..6ff751f8408 100644
--- a/backends/mediatek/README.md
+++ b/backends/mediatek/README.md
@@ -28,7 +28,7 @@ To get started with MediaTek's ExecuTorch libraries, download the [NeuroPilot Ex
- **`mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`**: This library preprocesses the model into a MediaTek representation.
-- **`mtk_neuron-8.2.19-py3-none-linux_x86_64.whl`**: This library converts the model to binaries.
+- **`mtk_neuron-8.2.23-py3-none-linux_x86_64.whl`**: This library converts the model to binaries.
Additionally, make sure to copy `NeuronAdapter.h` to the following directory: `backends/mediatek/runtime/include/api/`.
@@ -45,7 +45,7 @@ Follow the steps below to setup your build environment:
```
- Install the two .whl downloaded from NeuroPilot Portal
```bash
- pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+ pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```
diff --git a/docs/source/backends-mediatek.md b/docs/source/backends-mediatek.md
index a562cea13bd..34cd56f971b 100644
--- a/docs/source/backends-mediatek.md
+++ b/docs/source/backends-mediatek.md
@@ -23,7 +23,7 @@ The MediaTek backend enables acceleration of PyTorch models on edge devices with
```
- NeuroPilot SDK Python wheels (download from [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress)):
```bash
- pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+ pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```
From c44c5417f79b39c750701f66e0f26b84fa2cd770 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 12:11:58 -0400
Subject: [PATCH 117/266] Runner support for multiple ptd files (#14758)
This PR was created by the merge bot to help merge the original PR into
the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/14159 by
@lucylq
^ Please use this as the source of truth for the PR details, comments,
and reviews
ghstack PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/base
ghstack PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/main
Merge bot PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/orig
Differential Revision:
[D82072385](https://our.internmc.facebook.com/intern/diff/D82072385/)
@diff-train-skip-merge
Co-authored-by: lucylq
---
examples/models/llama/runner/runner.cpp | 17 +++++++++++++++-
examples/models/llama/runner/runner.h | 11 +++++++----
extension/llm/runner/llm_runner_helper.cpp | 22 +++++++++++++++++++--
extension/llm/runner/llm_runner_helper.h | 23 +++++++++++++++++++++-
4 files changed, 65 insertions(+), 8 deletions(-)
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
index 2ba2fdf9941..19ed9f88339 100644
--- a/examples/models/llama/runner/runner.cpp
+++ b/examples/models/llama/runner/runner.cpp
@@ -37,6 +37,21 @@ std::unique_ptr create_llama_runner(
const std::string& tokenizer_path,
std::optional data_path,
float temperature) {
+ if (data_path.has_value()) {
+ std::vector data_files;
+ data_files.push_back(data_path.value());
+ return create_llama_runner(
+ model_path, tokenizer_path, std::move(data_files), temperature);
+ }
+ return create_llama_runner(
+ model_path, tokenizer_path, std::vector(), temperature);
+}
+
+std::unique_ptr create_llama_runner(
+ const std::string& model_path,
+ const std::string& tokenizer_path,
+ std::vector data_files,
+ float temperature) {
ET_LOG(
Info,
"Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
@@ -55,7 +70,7 @@ std::unique_ptr create_llama_runner(
return nullptr;
}
return llm::create_text_llm_runner(
- model_path, std::move(tokenizer), data_path);
+ model_path, std::move(tokenizer), data_files);
}
} // namespace example
diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h
index f07cd4e8ee8..728ae57efa8 100644
--- a/examples/models/llama/runner/runner.h
+++ b/examples/models/llama/runner/runner.h
@@ -11,12 +11,9 @@
#pragma once
-#include
-#include
#include
#include
#include
-#include
#include
#include
@@ -30,7 +27,13 @@ namespace llm = ::executorch::extension::llm;
std::unique_ptr create_llama_runner(
const std::string& model_path,
const std::string& tokenizer_path,
- std::optional data_path = std::nullopt,
+ std::optional data_path,
+ float temperature = -1.0f);
+
+std::unique_ptr create_llama_runner(
+ const std::string& model_path,
+ const std::string& tokenizer_path,
+ std::vector data_files = {},
float temperature = -1.0f);
std::unique_ptr load_llama_tokenizer(
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index f12de5f1d87..d1e4ff2ce45 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -183,6 +183,24 @@ std::unique_ptr create_text_llm_runner(
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::optional data_path,
float temperature) {
+ if (data_path.has_value()) {
+ std::vector data_files;
+ data_files.push_back(data_path.value());
+ return create_text_llm_runner(
+ model_path, std::move(tokenizer), std::move(data_files), temperature);
+ }
+ return create_text_llm_runner(
+ model_path,
+ std::move(tokenizer),
+ std::vector(),
+ temperature);
+}
+
+std::unique_ptr create_text_llm_runner(
+ const std::string& model_path,
+ std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+ std::vector data_files,
+ float temperature) {
// Sanity check tokenizer
if (!tokenizer || !tokenizer->is_loaded()) {
ET_LOG(Error, "Tokenizer is null or not loaded");
@@ -191,9 +209,9 @@ std::unique_ptr create_text_llm_runner(
// Create the Module
std::unique_ptr module;
- if (data_path.has_value()) {
+ if (data_files.size() > 0) {
module = std::make_unique(
- model_path, data_path.value(), Module::LoadMode::File);
+ model_path, data_files, Module::LoadMode::File);
} else {
module = std::make_unique(model_path, Module::LoadMode::File);
}
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 191ea3ab090..5c109581e19 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -101,7 +101,28 @@ ET_EXPERIMENTAL std::unordered_set get_eos_ids(
ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
- std::optional data_path = std::nullopt,
+ std::optional data_path,
+ float temperature = -1.0f);
+
+/**
+ * @brief Creates a TextLLMRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a TextLLMRunner with all
+ * necessary components for text generation using the specified model and
+ * tokenizer.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_files Vector of paths to additional data required by the model
+ * @param temperature Optional temperature parameter for controlling randomness
+ * (deprecated)
+ * @return std::unique_ptr Initialized TextLLMRunner instance, or
+ * nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner(
+ const std::string& model_path,
+ std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+ std::vector data_files = {},
float temperature = -1.0f);
/**
From 4d681cb3b81de5b5fc4c7969f109e83e4607a06c Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 12:13:53 -0400
Subject: [PATCH 118/266] JNI support for multiple ptd files (#14769)
This PR was created by the merge bot to help merge the original PR into
the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/14168 by
@lucylq
^ Please use this as the source of truth for the PR details, comments,
and reviews
ghstack PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/base
ghstack PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/head
Merge bot PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/orig
Merge bot PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/orig
Differential Revision:
[D82072929](https://our.internmc.facebook.com/intern/diff/D82072929/)
@diff-train-skip-merge
---------
Co-authored-by: lucylq
---
.../executorch/extension/llm/LlmModule.java | 33 +++++++++++++++----
extension/android/jni/jni_layer_llama.cpp | 29 +++++++++++-----
2 files changed, 47 insertions(+), 15 deletions(-)
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index 289df5defd9..f135731f26a 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -11,6 +11,7 @@
import com.facebook.jni.HybridData;
import com.facebook.jni.annotations.DoNotStrip;
import java.io.File;
+import java.util.List;
import org.pytorch.executorch.ExecuTorchRuntime;
import org.pytorch.executorch.annotations.Experimental;
@@ -32,14 +33,22 @@ public class LlmModule {
@DoNotStrip
private static native HybridData initHybrid(
- int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath);
+ int modelType,
+ String modulePath,
+ String tokenizerPath,
+ float temperature,
+ List dataFiles);
/**
* Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * data path.
+ * dataFiles.
*/
public LlmModule(
- int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+ int modelType,
+ String modulePath,
+ String tokenizerPath,
+ float temperature,
+ List dataFiles) {
ExecuTorchRuntime runtime = ExecuTorchRuntime.getRuntime();
File modelFile = new File(modulePath);
@@ -50,12 +59,22 @@ public LlmModule(
if (!tokenizerFile.canRead() || !tokenizerFile.isFile()) {
throw new RuntimeException("Cannot load tokenizer path " + tokenizerPath);
}
- mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataPath);
+
+ mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataFiles);
+ }
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * data path.
+ */
+ public LlmModule(
+ int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+ this(modelType, modulePath, tokenizerPath, temperature, List.of(dataPath));
}
/** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */
public LlmModule(String modulePath, String tokenizerPath, float temperature) {
- this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, null);
+ this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of());
}
/**
@@ -63,12 +82,12 @@ public LlmModule(String modulePath, String tokenizerPath, float temperature) {
* path.
*/
public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) {
- this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, dataPath);
+ this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of(dataPath));
}
/** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) {
- this(modelType, modulePath, tokenizerPath, temperature, null);
+ this(modelType, modulePath, tokenizerPath, temperature, List.of());
}
/** Constructs a LLM Module for a model with the given LlmModuleConfig */
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index cabf30c42e4..a0c90991bf7 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -140,13 +140,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
facebook::jni::alias_ref model_path,
facebook::jni::alias_ref tokenizer_path,
jfloat temperature,
- facebook::jni::alias_ref data_path) {
+ facebook::jni::alias_ref data_files) {
return makeCxxInstance(
model_type_category,
model_path,
tokenizer_path,
temperature,
- data_path);
+ data_files);
}
ExecuTorchLlmJni(
@@ -154,7 +154,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
facebook::jni::alias_ref model_path,
facebook::jni::alias_ref tokenizer_path,
jfloat temperature,
- facebook::jni::alias_ref data_path = nullptr) {
+ facebook::jni::alias_ref data_files = nullptr) {
temperature_ = temperature;
#if defined(ET_USE_THREADPOOL)
// Reserve 1 thread for the main thread.
@@ -173,18 +173,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
model_path->toStdString().c_str(),
llm::load_tokenizer(tokenizer_path->toStdString()));
} else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
- std::optional data_path_str = data_path
- ? std::optional{data_path->toStdString()}
- : std::nullopt;
+ std::vector data_files_vector;
+ if (data_files != nullptr) {
+ // Convert Java List to C++ std::vector
+ auto list_class = facebook::jni::findClassStatic("java/util/List");
+ auto size_method = list_class->getMethod("size");
+ auto get_method =
+ list_class->getMethod(jint)>(
+ "get");
+
+ jint size = size_method(data_files);
+ for (jint i = 0; i < size; ++i) {
+ auto str_obj = get_method(data_files, i);
+ auto jstr = facebook::jni::static_ref_cast(str_obj);
+ data_files_vector.push_back(jstr->toStdString());
+ }
+ }
runner_ = executorch::extension::llm::create_text_llm_runner(
model_path->toStdString(),
llm::load_tokenizer(tokenizer_path->toStdString()),
- data_path_str);
+ data_files_vector);
#if defined(EXECUTORCH_BUILD_QNN)
} else if (model_type_category == MODEL_TYPE_QNN_LLAMA) {
std::unique_ptr module = std::make_unique<
executorch::extension::Module>(
model_path->toStdString().c_str(),
+ data_files_set,
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
std::string decoder_model = "llama3"; // use llama3 for now
runner_ = std::make_unique>( // QNN runner
@@ -192,7 +206,6 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
decoder_model.c_str(),
model_path->toStdString().c_str(),
tokenizer_path->toStdString().c_str(),
- data_path->toStdString().c_str(),
"");
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
#endif
From 7116e0ad6d0454755f1a90016ae96a4d2ede3329 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 3 Oct 2025 19:18:21 +0100
Subject: [PATCH 119/266] Tag mutated buffer for AOTI cuda partitioner (#14783)
This should avoid having to copy mutated buffer back to outputs.
Before PR I'm getting this graph:
```
graph():
%b_key_cache_0 : [num_users=1] = placeholder[target=b_key_cache_0]
%b_value_cache_0 : [num_users=1] = placeholder[target=b_value_cache_0]
%b_key_cache_1 : [num_users=1] = placeholder[target=b_key_cache_1]
%b_value_cache_1 : [num_users=1] = placeholder[target=b_value_cache_1]
%b_key_cache_2 : [num_users=1] = placeholder[target=b_key_cache_2]
%b_value_cache_2 : [num_users=1] = placeholder[target=b_value_cache_2]
%b_key_cache_3 : [num_users=1] = placeholder[target=b_key_cache_3]
%b_value_cache_3 : [num_users=1] = placeholder[target=b_value_cache_3]
...
%b_key_cache_29 : [num_users=1] = placeholder[target=b_key_cache_29]
%b_value_cache_29 : [num_users=1] = placeholder[target=b_value_cache_29]
%inputs_embeds : [num_users=1] = placeholder[target=inputs_embeds]
%cache_position : [num_users=1] = placeholder[target=cache_position]
%lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0]
%executorch_call_delegate : [num_users=61] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %inputs_embeds, %cache_position, %b_value_cache_0, %b_key_cache_0, %b_value_cache_1, %b_key_cache_1, %b_value_cache_2, %b_key_cache_2, %b_value_cache_3, %b_key_cache_3, %b_value_cache_4, %b_key_cache_4, %b_value_cache_5, %b_key_cache_5, %b_value_cache_6, %b_key_cache_6, %b_value_cache_7, %b_key_cache_7, %b_value_cache_8, %b_key_cache_8, %b_value_cache_9, %b_key_cache_9, %b_value_cache_10, %b_key_cache_10, %b_value_cache_11, %b_key_cache_11, %b_value_cache_12, %b_key_cache_12, %b_value_cache_13, %b_key_cache_13, %b_value_cache_14, %b_key_cache_14, %b_value_cache_15, %b_key_cache_15, %b_value_cache_16, %b_key_cache_16, %b_value_cache_17, %b_key_cache_17, %b_value_cache_18, %b_key_cache_18, %b_value_cache_19, %b_key_cache_19, %b_value_cache_20, %b_key_cache_20, %b_value_cache_21, %b_key_cache_21, %b_value_cache_22, %b_key_cache_22, %b_value_cache_23, %b_key_cache_23, %b_value_cache_24, %b_key_cache_24, %b_value_cache_25, %b_key_cache_25, %b_value_cache_26, %b_key_cache_26, %b_value_cache_27, %b_key_cache_27, %b_value_cache_28, %b_key_cache_28, %b_value_cache_29, %b_key_cache_29), kwargs = {})
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {})
%getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 1), kwargs = {})
%getitem_2 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 2), kwargs = {})
%getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 3), kwargs = {})
%getitem_4 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 4), kwargs = {})
...
%getitem_60 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 60), kwargs = {})
return (getitem_1, getitem, getitem_3, getitem_2, getitem_5, getitem_4, getitem_7, getitem_6, getitem_9, getitem_8, getitem_11, getitem_10, getitem_13, getitem_12, getitem_15, getitem_14, getitem_17, getitem_16, getitem_19, getitem_18, getitem_21, getitem_20, getitem_23, getitem_22, getitem_25, getitem_24, getitem_27, getitem_26, getitem_29, getitem_28, getitem_31, getitem_30, getitem_33, getitem_32, getitem_35, getitem_34, getitem_37, getitem_36, getitem_39, getitem_38, getitem_41, getitem_40, getitem_43, getitem_42, getitem_45, getitem_44, getitem_47, getitem_46, getitem_49, getitem_48, getitem_51, getitem_50, getitem_53, getitem_52, getitem_55, getitem_54, getitem_57, getitem_56, getitem_59, getitem_58, getitem_60)/home/larryliu/.conda/envs/executorch/lib/python3.11/site-packages/executorch/exir/emit/_emitter.py:1595: UserWarning: Mutation on a buffer in the model is detected. ExecuTorch assumes buffers that are mutated in the graph have a meaningless initial state, only the shape and dtype will be serialized, unless a pass which sets meta["et_init_buffer"] to True such as InitializedMutableBufferPass is run.
warnings.warn(
```
This is unnecessary because we don't want the kv cache as output.
After applying this PR I'm getting this graph instead:
```
graph():
%inputs_embeds : [num_users=1] = placeholder[target=inputs_embeds]
%cache_position : [num_users=1] = placeholder[target=cache_position]
%lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0]
%executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %inputs_embeds, %cache_position), kwargs = {})
%getitem_60 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {})
return (getitem_60,)
```
### Summary
[PLEASE REMOVE] See [CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests)
for ExecuTorch PR guidelines.
[PLEASE REMOVE] If this PR closes an issue, please add a `Fixes
#<issue-id>` line.
[PLEASE REMOVE] If this PR introduces a fix or feature that should be
in the upcoming release notes, please add a "Release notes: " label.
For a list of available release notes labels, check out
[CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests).
### Test plan
[PLEASE REMOVE] How did you test this PR? Please write down any manual
commands you used and note down tests that you have written if
applicable.
---
backends/cuda/cuda_partitioner.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index 14c75bdb937..64df7b7dcb2 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -15,7 +15,7 @@
Partitioner,
PartitionResult,
)
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch.export.exported_program import ExportedProgram
@@ -54,6 +54,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
partition_tags[tag] = self.delegation_spec
tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
From b021fd01eab33b14749197a1fd7bbd2dfa85e823 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Fri, 3 Oct 2025 14:24:33 -0700
Subject: [PATCH 120/266] Support im2row
Differential Revision: D83620790
Pull Request resolved: https://github.com/pytorch/executorch/pull/14729
---
backends/cadence/aot/ref_implementations.py | 113 +++++++
.../aot/tests/test_ref_implementations.py | 293 ++++++++++++++++++
2 files changed, 406 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index ca15e825ff0..886cb14d0d6 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -1303,3 +1303,116 @@ def rope(
[x0 * cos_tensor - x1 * sin_tensor, x0 * sin_tensor + x1 * cos_tensor], dim=-1
)
return rotated.view(original_shape)
+
+
+@impl(m, "im2row")
+def im2row(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: torch.Tensor,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ """
+ Converts an input tensor into a 2D matrix where each row is a flattened sliding window (patch)
+ from the input, suitable for use in convolution as a matrix multiplication (im2row).
+
+ Args:
+ - input_tensor: Input tensor of shape (N, C, H, W) or (N, H, W, C) if channel_last.
+ - kernel_size: Size of the convolution kernel.
+ - dilation: Dilation of the convolution kernel.
+ - padding: Padding to apply to the input.
+ - stride: Stride of the convolution.
+ - in_zero_point : Zero point for input quantization (broadcastable to input).
+ - channel_last: If True, input is in NHWC format, else NCHW.
+
+ Returns:
+ - Tensor of shape (N, num_patches, patch_size)
+ """
+ if len(input_tensor.shape) == 3:
+ height_dim = 1 if channel_last else 2
+ input_tensor = input_tensor.unsqueeze(height_dim)
+
+ if in_zero_point is not None:
+ if in_zero_point.numel() != 1 and in_zero_point.shape != (
+ input_tensor.shape[0],
+ ):
+ raise ValueError(
+ f"Input zero point must be a scalar or broadcastable to input shape {input_tensor.shape}"
+ )
+ if in_zero_point.dtype != torch.int32:
+ raise ValueError("Input zero point must be an int32 tensor")
+
+ if channel_last:
+ input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW
+
+ N, C, H, W = input_tensor.shape
+ kH, kW = kernel_size
+ dH, dW = dilation
+ pH, pW = padding
+ sH, sW = stride
+
+ # Handle padding with zero point values
+ if in_zero_point is not None and (pH > 0 or pW > 0):
+ # Expand zero point to (N, 1, 1, 1) for broadcasting
+ in_zero_point = in_zero_point.expand(N)
+
+ # Pad input with the per-batch zero point values
+ input_tensor = torch.stack(
+ [
+ torch.nn.functional.pad(
+ input_tensor[i],
+ (pW, pW, pH, pH),
+ mode="constant",
+ value=in_zero_point[i].item(),
+ )
+ for i in range(len(input_tensor))
+ ]
+ )
+
+ padding = (0, 0) # Already padded manually
+
+ # Use unfold to extract sliding local blocks
+ # Unfold: (N, C, H, W) -> (N, C, L, kH, kW), where L = number of sliding windows
+ # torch.nn.functional.unfold returns (N, C*kH*kW, L)
+ patches = torch.nn.functional.unfold(
+ input_tensor.float(), # unfold not implemented for int
+ kernel_size=(kH, kW),
+ dilation=(dH, dW),
+ padding=padding,
+ stride=(sH, sW),
+ ).to(
+ input_tensor.dtype
+ ) # (N, C*kH*kW, L)
+
+ # Transpose to (N, L, C*kH*kW)
+ patches = patches.transpose(1, 2).contiguous()
+
+ # Reshape to (N*L, C*kH*kW)
+ patches = patches.view(N, -1, C * kH * kW)
+
+ # If channel_last, output should be in NHWC patch order (but im2row is always row-major)
+ return patches
+
+
+@impl(m, "im2row.per_tensor")
+def im2row_per_tensor(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: int,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ return im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ torch.tensor(in_zero_point, dtype=torch.int32),
+ channel_last,
+ )
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 8d02c5c2963..0aa1f0a243a 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -1843,3 +1843,296 @@ def test_avg_pool2d(
torch.equal(output, expected_output),
f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
)
+
+ @expand(
+ [
+ # Basic 2x2 kernel, stride 1, no padding, NCHW
+ (
+ "nchw_basic_2x2",
+ torch.tensor(
+ [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+ ), # (N=1, C=1, H=3, W=3)
+ (2, 2), # kernel_size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (1, 1), # stride
+ None, # in_zero_point
+ False, # channel_last
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 2, no padding, NCHW
+ (
+ "nchw_stride2",
+ torch.tensor(
+ [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (2, 2),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5]],
+ ],
+ dtype=torch.float32, # Only every other patch in each dim
+ ),
+ ),
+ # 2x2 kernel, stride 1, padding 1, NCHW
+ (
+ "nchw_padding1",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32), # (1,1,2,2)
+ (2, 2),
+ (1, 1),
+ (1, 1),
+ (1, 1),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [0, 0, 0, 1],
+ [0, 0, 1, 2],
+ [0, 0, 2, 0],
+ [0, 1, 0, 3],
+ [1, 2, 3, 4],
+ [2, 0, 4, 0],
+ [0, 3, 0, 0],
+ [3, 4, 0, 0],
+ [4, 0, 0, 0],
+ ],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NHWC
+ (
+ "nhwc_basic_2x2",
+ torch.tensor(
+ [[[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]]],
+ dtype=torch.float32,
+ ), # (N=1, H=3, W=3, C=1)
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ None,
+ True,
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NCHW, in_zero_point=1
+ (
+ "nchw_in_zero_point_no_padding",
+ torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor(1, dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [[2, 3, 5, 6], [3, 4, 6, 7], [5, 6, 8, 9], [6, 7, 9, 10]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ (
+ "nchw_in_zero_point_with_padding=1_and_stride=2",
+ torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+ (2, 2),
+ (1, 1),
+ (1, 1),
+ (2, 2),
+ torch.tensor(-1, dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [-1, -1, -1, 2],
+ [-1, -1, 3, 4],
+ [-1, 5, -1, 8],
+ [6, 7, 9, 10],
+ ],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NHWC, in_zero_point=2
+ (
+ "nhwc_in_zero_point",
+ torch.tensor(
+ [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+ dtype=torch.int8,
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor(2, dtype=torch.int32),
+ True,
+ False,
+ torch.tensor(
+ [
+ [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ # Multi-channel input, 2x2 kernel, stride 1, no padding, NCHW
+ (
+ "nchw_multi_channel",
+ torch.tensor(
+ [
+ [
+ [[1, 2, 3], [4, 5, 6], [7, 8, 9]], # channel 0
+ [[10, 11, 12], [13, 14, 15], [16, 17, 18]], # channel 1
+ ]
+ ],
+ dtype=torch.float32,
+ ), # (1,2,3,3)
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 2, 4, 5, 10, 11, 13, 14],
+ [2, 3, 5, 6, 11, 12, 14, 15],
+ [4, 5, 7, 8, 13, 14, 16, 17],
+ [5, 6, 8, 9, 14, 15, 17, 18],
+ ],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # Multi-channel input and multi-channel zero-point
+ (
+ "nchw_multi_channel_and_zero_point_no_padding",
+ torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+ (1, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor([-1, -2], dtype=torch.int32),
+ False,
+ False,
+ torch.tensor([[[1, 2], [2, 3]], [[4, 5], [5, 6]]], dtype=torch.int32),
+ ),
+ (
+ "nchw_multi_channel_and_zero_point_with_padding=1_and_stride=(2, 1)",
+ torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+ (1, 2),
+ (1, 1),
+ (2, 1),
+ (2, 2),
+ torch.tensor([-1, -2], dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [-1, -1],
+ [-1, -1],
+ [-1, 1],
+ [2, 3],
+ [-1, -1],
+ [-1, -1],
+ ],
+ [
+ [-2, -2],
+ [-2, -2],
+ [-2, 4],
+ [5, 6],
+ [-2, -2],
+ [-2, -2],
+ ],
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "per_tensor",
+ torch.tensor(
+ [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+ dtype=torch.int8,
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ 2,
+ True,
+ True,
+ torch.tensor(
+ [
+ [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ ]
+ )
+ def test_im2row(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: torch.Tensor | None,
+ channel_last: bool,
+ per_tensor: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ if per_tensor:
+ output = torch.ops.cadence.im2row.per_tensor(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ in_zero_point,
+ channel_last,
+ )
+ else:
+ output = torch.ops.cadence.im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ in_zero_point,
+ channel_last,
+ )
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"im2row output shape mismatch in {name}",
+ )
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"im2row output mismatch in {name}: got {output}, expected {expected_output}",
+ )
From 7c7b729e0413390c8991819c87324ab9fb5d8c4c Mon Sep 17 00:00:00 2001
From: lucylq
Date: Fri, 3 Oct 2025 16:35:16 -0700
Subject: [PATCH 121/266] Patch
https://github.com/pytorch/executorch/pull/14754 (#14786)
landed as https://www.internalfb.com/diff/D82075758
---
.../ExecuTorch/Exported/ExecuTorchModule.h | 8 ++++----
.../ExecuTorch/Exported/ExecuTorchModule.mm | 19 +++++++++++++------
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
index cda9a914bc3..9b8400d739f 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
@@ -126,14 +126,14 @@ NS_SWIFT_NAME(Module)
* Initializes a module with a file path, data path and a specified load mode.
*
* @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
* external tensors and external data.
* @param loadMode A value from ExecuTorchModuleLoadMode that determines the
* file loading behavior.
* @return An initialized ExecuTorchModule instance.
*/
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath
+ dataFilePaths:(NSArray *)dataFilePaths
loadMode:(ExecuTorchModuleLoadMode)loadMode
NS_DESIGNATED_INITIALIZER;
@@ -141,12 +141,12 @@ NS_SWIFT_NAME(Module)
* Initializes a module with a file path, data path and a specified load mode.
*
* @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
* external tensors and external data.
* @return An initialized ExecuTorchModule instance.
*/
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath;
+ dataFilePaths:(NSArray *)dataFilePaths;
/**
* Initializes a module with a file path and a specified load mode.
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
index ce58f2fb21a..69bb59c860e 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
@@ -250,13 +250,20 @@ @implementation ExecuTorchModule {
}
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath
+ dataFilePaths:(NSArray *)dataFilePaths
loadMode:(ExecuTorchModuleLoadMode)loadMode {
self = [super init];
if (self) {
+ // Convert NSArray to std::vector
+ std::vector dataFilePathsVector;
+ if (dataFilePaths != nil) {
+ for (NSString *dataFile in dataFilePaths) {
+ dataFilePathsVector.emplace_back(dataFile.UTF8String);
+ }
+ }
_module = std::make_unique(
filePath.UTF8String,
- dataFilePath.UTF8String,
+ dataFilePathsVector,
static_cast(loadMode)
);
_inputs = [NSMutableDictionary new];
@@ -266,21 +273,21 @@ - (instancetype)initWithFilePath:(NSString *)filePath
}
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath {
+ dataFilePaths:(NSArray *)dataFilePaths {
return [self initWithFilePath:filePath
- dataFilePath:dataFilePath
+ dataFilePaths:dataFilePaths
loadMode:ExecuTorchModuleLoadModeFile];
}
- (instancetype)initWithFilePath:(NSString *)filePath
loadMode:(ExecuTorchModuleLoadMode)loadMode {
return [self initWithFilePath:filePath
- dataFilePath:@""
+ dataFilePaths:@[]
loadMode:loadMode];
}
- (instancetype)initWithFilePath:(NSString *)filePath {
return [self initWithFilePath:filePath
- dataFilePath:@""
+ dataFilePaths:@[]
loadMode:ExecuTorchModuleLoadModeFile];
}
From 0ee11607fc08d7c02374ddde1f92ed8c273b15b4 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:36:02 -0700
Subject: [PATCH 122/266] Add transposed im2row
Differential Revision: D83709868
Pull Request resolved: https://github.com/pytorch/executorch/pull/14738
---
backends/cadence/aot/ref_implementations.py | 156 ++++++++++++++++
.../aot/tests/test_ref_implementations.py | 170 ++++++++++++++++++
2 files changed, 326 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 886cb14d0d6..2642340679e 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -1416,3 +1416,159 @@ def im2row_per_tensor(
torch.tensor(in_zero_point, dtype=torch.int32),
channel_last,
)
+
+
+@impl(m, "transposed_im2row")
+def transposed_im2row(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ output_padding: tuple[int, int],
+ in_zero_point: torch.Tensor,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ """
+ Converts input tensor patches into im2row format for transposed convolutions.
+ This function extracts patches from input in a pattern suitable for transposed convolution.
+
+ Args:
+ - input_tensor: Input spatial tensor, NCHW or NHWC format (3D or 4D).
+ - kernel_size: Size of the convolution kernel.
+ - dilation: Dilation of the convolution kernel.
+ - padding: Padding to apply to the input.
+ - stride: Stride of the convolution.
+ - output_padding: Additional output padding for transposed convolution.
+ - in_zero_point: Zero point for input quantization (broadcastable to input).
+ - channel_last: If True, input is in NHWC format, else NCHW.
+
+ Returns:
+ - 3D tensor of shape (N, output_h * output_w, kernel_h * kernel_w * in_c)
+ """
+ # Handle 1D convolution case by adding height dimension
+ if len(input_tensor.shape) == 3:
+ height_dim = 1 if channel_last else 2
+ input_tensor = input_tensor.unsqueeze(height_dim)
+
+ if in_zero_point is not None:
+ if in_zero_point.dtype != torch.int32:
+ raise ValueError("Input zero point must be an int32 tensor")
+
+ # Move to NCHW for processing if needed
+ if channel_last:
+ input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW
+
+ N, C, H_in, W_in = input_tensor.shape
+
+ # Output: (N, C*H_in*W_in, H_out, W_out)
+ H_out = (
+ (H_in - 1) * stride[0]
+ + kernel_size[0]
+ + output_padding[0]
+ - 2 * padding[0]
+ + dilation[0] * (kernel_size[0] - 1)
+ )
+ W_out = (
+ (W_in - 1) * stride[1]
+ + kernel_size[1]
+ + output_padding[1]
+ - 2 * padding[1]
+ + dilation[1] * (kernel_size[1] - 1)
+ )
+
+ # For each input pixel, create a channel where the upsampled (transposed conv) patch is placed
+ # Output: (N, C*H_in*W_in, H_out, W_out)
+ inp_flat = input_tensor.reshape(N, C * H_in * W_in)
+
+ # Calculate output spatial size
+ H_out = (
+ (H_in - 1) * stride[0]
+ - 2 * padding[0]
+ + dilation[0] * (kernel_size[0] - 1)
+ + output_padding[0]
+ + 1
+ )
+ W_out = (
+ (W_in - 1) * stride[1]
+ - 2 * padding[1]
+ + dilation[1] * (kernel_size[1] - 1)
+ + output_padding[1]
+ + 1
+ )
+
+ # Compute the upsampled (top-left) position for each input pixel
+ h_idx = torch.arange(H_in, device=input_tensor.device)
+ w_idx = torch.arange(W_in, device=input_tensor.device)
+ grid_h, grid_w = torch.meshgrid(h_idx, w_idx, indexing="ij")
+ out_h_idx = grid_h * stride[0] - padding[0]
+ out_w_idx = grid_w * stride[1] - padding[1]
+
+ # Compute all input pixel positions (flattened)
+ ch_idx = torch.arange(C * H_in * W_in, device=input_tensor.device)
+ ij_idx = ch_idx % (H_in * W_in)
+ i_idx = ij_idx // W_in
+ j_idx = ij_idx % W_in
+
+ # For each input pixel, compute the output positions for the kernel window
+ kh_idx = torch.arange(kernel_size[0], device=input_tensor.device)
+ kw_idx = torch.arange(kernel_size[1], device=input_tensor.device)
+ kh_grid, kw_grid = torch.meshgrid(kh_idx, kw_idx, indexing="ij")
+ kh_grid = kh_grid.reshape(-1)
+ kw_grid = kw_grid.reshape(-1)
+ num_kernel = kernel_size[0] * kernel_size[1]
+
+ # Broadcast to all channels and kernel positions
+ ch_idx_b = ch_idx.repeat_interleave(num_kernel)
+ n_kernel = ch_idx.shape[0] * num_kernel
+
+ i_idx_b = i_idx.repeat_interleave(num_kernel)
+ j_idx_b = j_idx.repeat_interleave(num_kernel)
+ kh_b = kh_grid.repeat(ch_idx.shape[0])
+ kw_b = kw_grid.repeat(ch_idx.shape[0])
+
+ h_out = out_h_idx[i_idx_b, j_idx_b] + kh_b * dilation[0]
+ w_out = out_w_idx[i_idx_b, j_idx_b] + kw_b * dilation[1]
+
+ # Mask for valid output positions
+ valid = (h_out >= 0) & (h_out < H_out) & (w_out >= 0) & (w_out < W_out)
+
+ # Prepare indices for advanced indexing
+ n_idx = (
+ torch.arange(N, device=input_tensor.device)
+ .view(-1, 1)
+ .expand(N, n_kernel)
+ .reshape(-1)
+ )
+ ch_idx_full = ch_idx_b.expand(N, n_kernel).reshape(-1)
+ h_out_full = h_out.expand(N, n_kernel).reshape(-1)
+ w_out_full = w_out.expand(N, n_kernel).reshape(-1)
+ valid_full = valid.expand(N, n_kernel).reshape(-1)
+
+ # Gather input values for each channel
+ inp_vals = inp_flat[:, ch_idx_b].reshape(-1)
+
+ # Create output tensor
+ patches = torch.zeros((N, C * H_in * W_in, H_out, W_out), dtype=input_tensor.dtype)
+
+ # If in_zero_point is provided, fill patches with it
+ if in_zero_point is not None:
+ if in_zero_point.numel() == 1:
+ patches.fill_(in_zero_point.item())
+ else:
+ # Broadcast in_zero_point to (N, C, H_in, W_in)
+ assert in_zero_point.shape == (N,)
+ in_zero_point = in_zero_point.view(N, 1, 1, 1)
+ patches = patches + in_zero_point
+
+ # Scatter input values to output positions (only valid positions)
+ patches[
+ n_idx[valid_full],
+ ch_idx_full[valid_full],
+ h_out_full[valid_full],
+ w_out_full[valid_full],
+ ] = inp_vals[valid_full]
+
+ # Optionally, flatten to (N, num_patches, patch_size) if needed
+ patches = patches.view(N, C * H_in * W_in, -1).transpose(1, 2).contiguous()
+ return patches
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 0aa1f0a243a..f78d2292e7b 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -2136,3 +2136,173 @@ def test_im2row(
torch.equal(output, expected_output),
f"im2row output mismatch in {name}: got {output}, expected {expected_output}",
)
+
+ @expand(
+ [
+ (
+ "basic_2x2",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ None,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 0, 0, 0],
+ [1, 2, 0, 0],
+ [0, 2, 0, 0],
+ [1, 0, 3, 0],
+ [1, 2, 3, 4],
+ [0, 2, 0, 4],
+ [0, 0, 3, 0],
+ [0, 0, 3, 4],
+ [0, 0, 0, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "basic_2x2_with_zero_point",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ torch.tensor(100, dtype=torch.int32),
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 100, 100, 100],
+ [1, 2, 100, 100],
+ [100, 2, 100, 100],
+ [1, 100, 3, 100],
+ [1, 2, 3, 4],
+ [100, 2, 100, 4],
+ [100, 100, 3, 100],
+ [100, 100, 3, 4],
+ [100, 100, 100, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "basic_2x2_with_stride_2",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2), # kernel size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (2, 2), # stride
+ (0, 0), # output padding
+ None,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 0, 0, 0],
+ [1, 0, 0, 0],
+ [0, 2, 0, 0],
+ [0, 2, 0, 0],
+ [1, 0, 0, 0],
+ [1, 0, 0, 0],
+ [0, 2, 0, 0],
+ [0, 2, 0, 0],
+ [0, 0, 3, 0],
+ [0, 0, 3, 0],
+ [0, 0, 0, 4],
+ [0, 0, 0, 4],
+ [0, 0, 3, 0],
+ [0, 0, 3, 0],
+ [0, 0, 0, 4],
+ [0, 0, 0, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "batch2_with_batch2_zero_point",
+ torch.tensor(
+ [
+ [[[1, 2], [3, 4]]],
+ [[[5, 6], [7, 8]]],
+ ],
+ dtype=torch.int32,
+ ), # input: (2,1,2,2)
+ (2, 2), # kernel_size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (1, 1), # stride
+ (0, 0), # output_padding
+ torch.tensor([100, 200], dtype=torch.int32), # in_zero_point per batch
+ False, # channel_last
+ torch.tensor(
+ [
+ [
+ [1, 100, 100, 100],
+ [1, 2, 100, 100],
+ [100, 2, 100, 100],
+ [1, 100, 3, 100],
+ [1, 2, 3, 4],
+ [100, 2, 100, 4],
+ [100, 100, 3, 100],
+ [100, 100, 3, 4],
+ [100, 100, 100, 4],
+ ],
+ [
+ [5, 200, 200, 200],
+ [5, 6, 200, 200],
+ [200, 6, 200, 200],
+ [5, 200, 7, 200],
+ [5, 6, 7, 8],
+ [200, 6, 200, 8],
+ [200, 200, 7, 200],
+ [200, 200, 7, 8],
+ [200, 200, 200, 8],
+ ],
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ ]
+ )
+ def test_transposed_im2row(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ output_padding: tuple[int, int],
+ in_zero_point: torch.Tensor | int | None,
+ channel_last: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ output = torch.ops.cadence.transposed_im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ output_padding,
+ in_zero_point,
+ channel_last,
+ )
+
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"transposed_im2row output shape mismatch in {name}: got {output.shape}, expected {expected_output.shape}",
+ )
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"transposed_im2row output mismatch in {name}: got {output}, expected {expected_output}",
+ )
From 0b5a4ab1ff1ebe3262742764c19d5c8cc15874ef Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Fri, 3 Oct 2025 20:19:41 -0700
Subject: [PATCH 123/266] Update linear -> conv2d int16 for Ethos
Differential Revision: D83632029
Pull Request resolved: https://github.com/pytorch/executorch/pull/14763
---
backends/arm/operators/op_conv2d.py | 6 +++---
backends/arm/test/ops/test_linear.py | 14 ++------------
2 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py
index 469e6613c1f..933e353387b 100644
--- a/backends/arm/operators/op_conv2d.py
+++ b/backends/arm/operators/op_conv2d.py
@@ -182,11 +182,11 @@ def define_node(
acc_type = ts.DType.FP32
tosa_graph.addConst(
- [1], output.dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
+ [1], inputs[0].dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
)
tosa_graph.addConst(
[1],
- output.dtype,
+ inputs[1].dtype,
weight_zp,
name=f"{conv2d_output_name}_weight_zp",
)
@@ -269,7 +269,7 @@ def define_node(
# For quantized convolution, rescale the output value back to the same
# integer value domain of the next op. Otherwise return float32 output.
- if inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.INT16:
+ if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16:
# Get scale_factor from input, weight, and output.
input_scale = input_qparams[0].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore [61]
per_channel_quant = input_qparams[1].per_channel # pyre-ignore [61]
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index bd719954ff5..4029fcef54e 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -8,8 +8,6 @@
from typing import Tuple
-import pytest
-
import torch
from executorch.backends.arm.quantizer.arm_quantizer import (
get_symmetric_a16w8_quantization_config,
@@ -313,12 +311,8 @@ def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
pipeline.run()
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
@common.XfailIfNoCorstone300
-@pytest.mark.xfail(
- reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
- strict=False,
-)
def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
"""Test linear operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
test_data, out_features, has_bias, per_channel_quantization = test_data()
@@ -347,12 +341,8 @@ def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
pipeline.run()
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
- strict=False,
-)
def test_linear_16a8w_u85_INT16(test_data: torch.Tensor):
"""Test linear operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
test_data, out_features, has_bias, per_channel_quantization = test_data()
From ca9fc0613063ce8d15148ca9c3dfe7e94b6b14c0 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 23:32:01 -0400
Subject: [PATCH 124/266] [Release Only] Bugfix/fix nxp separable conv test
(#14800)
### Summary
Fix failing separable convolution test.
The error is larger on the CI than on my PC.
Fixes #14709
### Test plan
N/A
Co-authored-by: Martin Pavella
---
backends/nxp/tests/test_split_group_convolution.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py
index 21ab1c5b59a..4c9f277e34d 100644
--- a/backends/nxp/tests/test_split_group_convolution.py
+++ b/backends/nxp/tests/test_split_group_convolution.py
@@ -110,7 +110,7 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int
input_data = torch.randn(input_shape, dtype=torch.float32)
out1 = original_module(input_data).detach().numpy()
out2 = modified_module(input_data).detach().numpy()
- assert np.allclose(out1, out2, atol=2.0e-7)
+ assert np.allclose(out1, out2, atol=2.0e-7, rtol=1.9e-4)
# Make sure the graph can be correctly quantized and lowered to edge.
ep = _quantize_and_lower_module(
From 3f0896a5d9dd70f5c21bf2368640d748192f0238 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Sat, 4 Oct 2025 00:00:31 -0400
Subject: [PATCH 125/266] [ET-VK] Miscellaneous fixes (#14801)
Collecting fixes for various models/ops in this diff/PR.
They have all been squashed into this single change to make it easier to cherry pick.
# Fixes
## Wav2Letter
Type: Output correctness failure
This is caused by a bug in swiftshader, and not reproducible on any other platform. Specifically, the issue is in the softmax shader; the exact cause of the issue is unknown, but it is related to using shared memory within shaders. The workaround for this issue is to use separate shared memory arrays for the shared max and shared sum.
## ConvNeXT
Type: Exception during runtime
This is caused by an incompatible memory layout being used for mean2d. More technically, the packed dimension of the tensor cannot be one of the dims being reduced. The current operator registry system did not have a way to select valid tensor representations based on the actual arguments of an op.
To fix, we have to introduce a mechanism for ops to specify valid representations once a node's arguments are known. Once the model is exported with supported memory layout, the model test passes.
## Inception_V3/ViT
Type: Exception during runtime
The root cause of this was an interaction between the fuse batch norm pass and how `vulkan_preprocess.py` was applying passes. Essentially, the fuse batch norm pass creates a new param node for the fused weight, but after the pass is applied `_copy_module` is used to copy the transformed graph back into the ExportedProgram. However, it seems that _copy_module lowercases the node names without updating the exported program's graph signature. Therefore, subsequent passes couldn't recognize the weight tensor of convolution tensors as a constant/parameter node.
The solution was to migrate vulkan_preprocess.py to use the _transform() API instead of using _copy_module.
## DenseNet 161 (w/ dynamic shapes)
Type: Output Mismatch
Cause: the native_batch_norm op doesn't support dynamic shapes. However, the backend test runner doesn't set the correct compile option to filter ops without dynamic shape support.
Differential Revision: [D83703496](https://our.internmc.facebook.com/intern/diff/D83703496/)
[ghstack-poisoned]
---
.github/workflows/pull.yml | 7 +-
backends/vulkan/_passes/fold_qdq.py | 5 +-
backends/vulkan/_passes/fuse_patterns.py | 10 +-
backends/vulkan/_passes/fuse_quantized_ops.py | 10 +-
.../vulkan/_passes/tag_memory_meta_pass.py | 4 +
backends/vulkan/op_registry.py | 93 +++++++++----
.../vulkan/partitioner/vulkan_partitioner.py | 10 +-
backends/vulkan/patterns/quantized_linear.py | 12 +-
.../vulkan/runtime/graph/ops/glsl/conv2d.glsl | 2 +-
.../runtime/graph/ops/glsl/conv2d_dw.glsl | 2 +-
.../graph/ops/glsl/conv2d_dw_output_tile.glsl | 4 +
.../vulkan/runtime/graph/ops/glsl/full.yaml | 1 +
.../runtime/graph/ops/glsl/softmax.glsl | 27 ++--
.../runtime/graph/ops/impl/BatchNorm.cpp | 14 +-
.../vulkan/runtime/graph/ops/impl/Permute.cpp | 8 +-
.../vulkan/runtime/graph/ops/impl/Pool.cpp | 4 +-
.../vulkan/runtime/graph/ops/impl/Squeeze.cpp | 9 +-
backends/vulkan/test/TARGETS | 1 -
backends/vulkan/test/test_vulkan_passes.py | 70 +---------
backends/vulkan/test/utils.py | 4 +-
backends/vulkan/utils.py | 19 ++-
backends/vulkan/vulkan_preprocess.py | 59 ++++----
examples/vulkan/export.py | 127 +++++++++++-------
23 files changed, 298 insertions(+), 204 deletions(-)
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index c15fadd102f..845cb5d8631 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -970,11 +970,16 @@ jobs:
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build
# Test models serially
- models="mv2 mv3 edsr resnet18 resnet50 dl3"
+ models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4"
for model in $models; do
python -m examples.vulkan.export --model_name=$model --test
done
+ # For selected vision models, test with dynamic shapes
+ models="mv2 resnet18 resnet50 ic3 densenet161"
+ for model in $models; do
+ python -m examples.vulkan.export --model_name=$model --test -d
+ done
test-vulkan-operators-linux:
name: test-vulkan-operators-linux
diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py
index 3beccc2205c..a6a5e751c05 100644
--- a/backends/vulkan/_passes/fold_qdq.py
+++ b/backends/vulkan/_passes/fold_qdq.py
@@ -17,9 +17,8 @@ class FoldQDQPass(ExportPass):
valid quant op patterns have already been fused before this pass.
"""
- def __init__(self, edge_program: torch.export.ExportedProgram):
- super(FoldQDQPass, self).__init__()
- self.edge_program = edge_program
+ def __init__(self):
+ super().__init__()
def call(self, graph_module: torch.fx.GraphModule):
for node in graph_module.graph.nodes:
diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py
index 6ced1f32a7c..1575dd6a4f6 100644
--- a/backends/vulkan/_passes/fuse_patterns.py
+++ b/backends/vulkan/_passes/fuse_patterns.py
@@ -4,6 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+from typing import Optional
+
import executorch.backends.vulkan.patterns as vk_patterns
import torch
@@ -13,13 +15,15 @@
class FusePatternsPass(ExportPass):
- def __init__(self, exported_program: ExportedProgram) -> None:
+ def __init__(self) -> None:
super().__init__()
- self.program = exported_program
+ self._exported_program: Optional[ExportedProgram] = None
def call(self, graph_module: torch.fx.GraphModule):
+ assert self._exported_program is not None
+
total_replaced = vk_patterns.replace_all_fusable_subgraphs(
- self.program, graph_module
+ self._exported_program, graph_module
)
if total_replaced > 0:
diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py
index ca9f7541159..bb8cf5f2e64 100644
--- a/backends/vulkan/_passes/fuse_quantized_ops.py
+++ b/backends/vulkan/_passes/fuse_quantized_ops.py
@@ -211,18 +211,20 @@ def fuse_into_linear_qcnw_node(
class FuseQuantizedOpsTransform(ExportPass):
- def __init__(self, exported_program: ExportedProgram) -> None:
+ def __init__(self) -> None:
super().__init__()
- self.program = exported_program
+ self._exported_program: Optional[ExportedProgram] = None
def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+ assert self._exported_program is not None
+
for node in graph_module.graph.nodes:
# Check for linear_qcnw pattern (weight-only quantization)
- qcnw_details = matches_linear_qcnw_pattern(self.program, node)
+ qcnw_details = matches_linear_qcnw_pattern(self._exported_program, node)
if qcnw_details is not None:
qcnw_method, qcnw_nbits = qcnw_details
fuse_into_linear_qcnw_node(
- self.program, graph_module, node, qcnw_method, qcnw_nbits
+ self._exported_program, graph_module, node, qcnw_method, qcnw_nbits
)
continue
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
index db53cc666a8..8ed71aa1dae 100644
--- a/backends/vulkan/_passes/tag_memory_meta_pass.py
+++ b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -230,6 +230,10 @@ def get_arg_tensor_source_repset(
"""
arg_node = op_node.args[arg_i]
+ # For non-tensor arguments, return ANY_STORAGE
+ if not utils.is_tensor_arg_node(arg_node):
+ return utils.ANY_STORAGE
+
# Special case for cat - use the first tensor in the list as representative
if isinstance(arg_node, list):
arg_node = arg_node[0]
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index a92b3b11f6f..63b57a0e79c 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -16,8 +16,6 @@
import torch
-from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout
-
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -48,6 +46,9 @@ class OpFeatures:
# Optional check function used during partitioning to determine if a node's
# inputs are supported by the operator implementation.
"are_node_inputs_supported_fn",
+ # Optional function to determine valid representation sets for input and outputs
+ # once a node's actual inputs are known.
+ "pick_io_storage_fn",
]
def __init__(
@@ -61,6 +62,7 @@ def __init__(
supports_resize: bool = False,
supports_prepacking: bool = False,
are_node_inputs_supported_fn: Optional[Callable] = allow_node,
+ pick_io_storage_fn: Optional[Callable] = None,
):
self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList(
inputs_storage if inputs_storage is not None else []
@@ -77,15 +79,21 @@ def __init__(
self.supports_prepacking = supports_prepacking
self.are_node_inputs_supported_fn = are_node_inputs_supported_fn
+ self.pick_io_storage_fn = pick_io_storage_fn
def make_op_repsets(
self,
op_node: torch.fx.Node,
texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS,
) -> utils.OpRepSets:
- return utils.OpRepSets(
- self.inputs_storage, self.outputs_storage, op_node, texture_limits
- )
+ inputs_storage = self.inputs_storage
+ outputs_storage = self.outputs_storage
+ if self.pick_io_storage_fn is not None:
+ i_storage, o_storage = self.pick_io_storage_fn(op_node)
+ inputs_storage = utils.TensorRepSetList(i_storage)
+ outputs_storage = utils.TensorRepSetList(o_storage)
+
+ return utils.OpRepSets(inputs_storage, outputs_storage, op_node, texture_limits)
#######################
@@ -410,28 +418,16 @@ def register_softmax_op():
)
def register_reduce_op():
def check_reduce_node(node: torch.fx.Node) -> bool:
+ # Only one argument implies that the reduction is over the entire tensor, which
+ # is not supported yet.
+ if len(node.args) == 1:
+ return False
+
dim_list = node.args[1]
+ # Only 1D and 2D reductions are supported at the moment.
if isinstance(dim_list, list) and len(dim_list) > 2:
return False
- if isinstance(dim_list, list) and len(dim_list) == 2:
- # Try to get the memory layout for this node
- try:
- memory_layout = utils.get_node_memory_layout(node)
-
- # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension
- if (
- memory_layout is not None
- and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT
- ):
- # For now only default layout is supported for 2D reduction.
- # Because we can't determine if the input is NCHW or NHWC here,
- # assume the reduction dimension is packed so we cannot support it.
- return False
- except (AssertionError, KeyError, AttributeError):
- # If we can't get memory layout information, we'll assume the dims aren't packed
- pass
-
def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
for arg in node.args:
if isinstance(arg, bool):
@@ -446,10 +442,41 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
return True
+ def pick_io_storage_for_reduce(node: torch.fx.Node):
+ inputs_storage = utils.ANY_TEXTURE
+ outputs_storage = utils.ANY_TEXTURE
+
+ input_tensor = node.args[0]
+ ndim = input_tensor.meta["val"].ndim
+ dim_list = node.args[1]
+ if isinstance(dim_list, list) and len(dim_list) == 2:
+ reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim)
+ reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim)
+
+ possible_packed_dims = {0, 1, 2}
+ possible_packed_dims.discard(reduce_dim1_whcn)
+ possible_packed_dims.discard(reduce_dim2_whcn)
+
+ packed_dim = possible_packed_dims.pop()
+ assert packed_dim in [0, 1, 2]
+
+ if packed_dim == 0:
+ inputs_storage = utils.WIDTH_PACKED_TEXTURE
+ outputs_storage = utils.WIDTH_PACKED_TEXTURE
+ elif packed_dim == 1:
+ inputs_storage = utils.HEIGHT_PACKED_TEXTURE
+ outputs_storage = utils.HEIGHT_PACKED_TEXTURE
+ else:
+ inputs_storage = utils.CHANNELS_PACKED_TEXTURE
+ outputs_storage = utils.CHANNELS_PACKED_TEXTURE
+
+ return inputs_storage, outputs_storage
+
return OpFeatures(
inputs_storage=utils.ANY_TEXTURE,
supports_resize=True,
are_node_inputs_supported_fn=check_reduce_node,
+ pick_io_storage_fn=pick_io_storage_for_reduce,
)
@@ -474,6 +501,23 @@ def register_2d_pool_op():
]
)
def register_convolution_op():
+ def check_conv_node(node: torch.fx.Node) -> bool:
+ x = node.args[0]
+ x_shape = x.meta["val"].size()
+ # 4-D input implies 2D convolution
+ if len(x_shape) == 4:
+ batches = x.meta["val"].size()[0]
+ if batches != 1:
+ return False
+ # 3-D input implies 1D convolution
+ if len(x_shape) == 3:
+ transpose = node.args[6]
+ # Transposed 1D convolution is not supported yet
+ if transpose:
+ return False
+
+ return True
+
return OpFeatures(
inputs_storage=[
utils.CHANNELS_PACKED_TEXTURE, # input
@@ -490,6 +534,7 @@ def register_convolution_op():
],
supports_resize=True,
supports_prepacking=True,
+ are_node_inputs_supported_fn=check_conv_node,
)
@@ -716,6 +761,7 @@ def register_ported_ops_with_prepacking():
return OpFeatures(
inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
supports_prepacking=True,
+ supports_resize=True,
)
@@ -746,6 +792,7 @@ def register_ported_ops_with_prepacking_all_dims():
return OpFeatures(
inputs_storage=utils.ANY_TEXTURE,
supports_prepacking=True,
+ supports_resize=True,
)
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index e5b2d0f7864..0bdc16616ef 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -36,7 +36,7 @@
Partitioner,
PartitionResult,
)
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from executorch.exir.dialects._ops import ops as exir_ops
from torch.export.exported_program import ExportedProgram
@@ -254,9 +254,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901
self.log_skip(node, "permute node of non compatible linear node")
return False
- is_in_local_scalar_dense_chain, dst_node_is_compatible = (
- self.is_in_local_scalar_dense_chain(node)
- )
+ (
+ is_in_local_scalar_dense_chain,
+ dst_node_is_compatible,
+ ) = self.is_in_local_scalar_dense_chain(node)
if is_in_local_scalar_dense_chain and dst_node_is_compatible:
return True
elif is_in_local_scalar_dense_chain:
@@ -419,6 +420,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.")
tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py
index 882d0d41e6d..374e29c634d 100644
--- a/backends/vulkan/patterns/quantized_linear.py
+++ b/backends/vulkan/patterns/quantized_linear.py
@@ -92,9 +92,11 @@ def __init__(self, mm_node: torch.fx.Node) -> None:
return
# Identify input node
- self.fp_input_node, self.quantize_input_node, dq_node = (
- utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
- )
+ (
+ self.fp_input_node,
+ self.quantize_input_node,
+ dq_node,
+ ) = utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
assert self.fp_input_node is not None
self.all_nodes.append(self.fp_input_node)
@@ -386,7 +388,7 @@ def make_linear_dq8ca_q4gsw_op(
weight_sums_node = create_constant_placeholder(
exp_program=ep,
graph=graph_module.graph,
- kind=InputKind.CONSTANT_TENSOR,
+ kind=InputKind.PARAMETER,
name=sums_name,
data=sum_per_quant_group,
)
@@ -429,7 +431,7 @@ def make_linear_q8ta_q8csw_custom_op(
weight_sums_node = create_constant_placeholder(
exp_program=ep,
graph=graph_module.graph,
- kind=InputKind.CONSTANT_TENSOR,
+ kind=InputKind.PARAMETER,
name=sums_name,
data=sum_per_output_channel,
)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
index 0f5dbc41273..88746c5594e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -60,7 +60,7 @@ void main() {
int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y;
start.y = ipos.y + num_steps * dilation.y;
}
- const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy));
+ const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
// Compute the start of the kernel based on how far we are skipping ahead when
// reading the input. Note that these are "canonical" indices.
ivec2 kstart = (start - ipos) / dilation;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 02fbef29b75..9089f87d658 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -54,7 +54,7 @@ void main() {
// Compute the start and end of the input indices to load. Padding is assumed
// to be constant 0 padding, so reads from the padding region are skipped.
const ivec2 start = ipos;
- const ivec2 end = ipos + overlay_region.xy;
+ const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
int kx = 0;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 19250419baf..7448b042cad 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -97,6 +97,10 @@ void main() {
for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) {
for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) {
in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+ // Set to zero if reading out of bounds
+ if (any(greaterThanEqual(ivec2(x, y), in_sizes.xy))) {
+ in_texels[j] = VEC4_T(0);
+ }
}
// from 2nd iteration onwards accumulate dot product in 2nd sum
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
index eff78a7938d..1a5b0cb235e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
@@ -14,5 +14,6 @@ full:
DTYPE:
- VALUE: half
- VALUE: float
+ - VALUE: int32
shader_variants:
- NAME: full
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
index d35492bc367..86a2229c416 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
@@ -42,7 +42,8 @@ layout(constant_id = 5) const int group_dim = 1;
// work group will write into its assigned element in the shared array.
#define MAX_NTHREADS 16
-shared vec4 shared_vecs[MAX_NTHREADS];
+shared vec4 shared_max[MAX_NTHREADS];
+shared vec4 shared_sum[MAX_NTHREADS];
#include "indexing_utils.h"
@@ -102,13 +103,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
max_elements = max(max_elements, load_texel(tin, scan_pos));
}
- shared_vecs[smi] = max_elements;
+ shared_max[smi] = max_elements;
barrier();
// Iterate over the partial maximums to obtain the overall maximum
group_i = tid.y * NWORKERS;
- max_elements = shared_vecs[group_i++];
+ max_elements = shared_max[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- max_elements = max(max_elements, shared_vecs[group_i]);
+ max_elements = max(max_elements, shared_max[group_i]);
}
scan_pos[reduce_dim] = tid.x;
@@ -118,13 +119,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
denominators += exp(load_texel(tin, scan_pos) - max_elements);
}
- shared_vecs[smi] = denominators;
+ shared_sum[smi] = denominators;
barrier();
// Iterate over the partial sums to obtain the overall sum
group_i = tid.y * NWORKERS;
- denominators = shared_vecs[group_i++];
+ denominators = shared_sum[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- denominators += shared_vecs[group_i];
+ denominators += shared_sum[group_i];
}
// Determine if there are any padding elements in the final texel of the
@@ -184,13 +185,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
max_elements.x = max(intex[i], max_elements.x);
}
}
- shared_vecs[smi] = max_elements;
+ shared_max[smi] = max_elements;
barrier();
// Iterate over the partial maximums to obtain the overall maximum
group_i = tid.y * NWORKERS;
- max_elements = shared_vecs[group_i++];
+ max_elements = shared_max[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- max_elements = max(max_elements, shared_vecs[group_i]);
+ max_elements = max(max_elements, shared_max[group_i]);
}
// Each element of the texel is itself a partial maximum; iterate over the
// texel to find the actual maximum
@@ -214,13 +215,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
denominators.x += exp(intex[i] - max_element);
}
}
- shared_vecs[smi] = denominators;
+ shared_sum[smi] = denominators;
barrier();
// Iterate over the partial sums to obtain the overall sum
group_i = tid.y * NWORKERS;
- denominators = shared_vecs[group_i++];
+ denominators = shared_sum[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- denominators += shared_vecs[group_i];
+ denominators += shared_sum[group_i];
}
// Reduce over the accumulated texel to find the overall sum
float denominator = 0;
diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
index 757afd06849..a6dd8f07f53 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -19,6 +19,18 @@
namespace vkcompute {
+void resize_batch_norm_node(
+ ComputeGraph* graph,
+ const std::vector& args,
+ const std::vector& extra_args) {
+ const ValueRef out = args.at(0).refs.at(0);
+ const ValueRef self = args.at(1).refs.at(0);
+
+ // For batch norm, output dimensions are the same as input dimensions
+ std::vector new_out_sizes = graph->sizes_of(self);
+ graph->virtual_resize(out, new_out_sizes);
+}
+
ValueRef check_and_prepack_arg(
ComputeGraph& graph,
ValueRef arg_ref,
@@ -101,7 +113,7 @@ void add_native_batch_norm_node(
// Resize Args
{},
// Resizing Logic
- nullptr));
+ resize_batch_norm_node));
}
void native_batch_norm(ComputeGraph& graph, const std::vector& args) {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index 9ac4c963bc3..329620e80e6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -109,11 +109,15 @@ void add_permute_node(
{
IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
const int32_t permute_ndim =
- utils::safe_downcast(permute_dims_ptr->size());
+ utils::safe_downcast(permute_dims_ptr->size());
for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0;
nchw_i--, whcn_i++) {
- const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i);
+ int32_t permute_dim_nchw =
+ utils::safe_downcast(permute_dims_ptr->at(nchw_i));
+ if (permute_dim_nchw < 0) {
+ permute_dim_nchw += permute_ndim;
+ }
const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw;
whcn_permute_dims[whcn_i] = permute_dim_whcn;
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index 250fcdd5490..879f59667d6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -137,7 +137,7 @@ void max_pool2d(ComputeGraph& graph, const std::vector& args) {
struct DivisorParams final {
int32_t divisor_override;
- bool count_include_pad;
+ int32_t count_include_pad;
};
DivisorParams create_divisor_params(
@@ -148,7 +148,7 @@ DivisorParams create_divisor_params(
graph.val_is_int(divisor_override)
? static_cast(graph.get_int(divisor_override))
: 0,
- graph.get_bool(count_include_pad)};
+ int32_t(graph.get_bool(count_include_pad))};
}
void add_avg_pool2d_node(
diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
index 13801b45cc7..e2b73b2f3f2 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
@@ -32,8 +32,13 @@ void add_squeeze_copy_dims_node(
// 2. Squeeze outter most dim
// For these cases, just pass input to output via clone.
for (int i = 0; i < dims.size(); ++i) {
- if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) {
- squeeze_dims.push_back(dims.at(i));
+ // adjust negative dims
+ int64_t dim_val = dims.at(i);
+ if (dim_val < 0) {
+ dim_val += in_dim;
+ }
+ if (dims.at(i) != 0 && in_sizes.at(dim_val) == 1) {
+ squeeze_dims.push_back(dim_val);
}
}
if (squeeze_dims.size() == 0) {
diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS
index 53fad86f90c..ee296a4f68f 100644
--- a/backends/vulkan/test/TARGETS
+++ b/backends/vulkan/test/TARGETS
@@ -34,7 +34,6 @@ python_unittest(
deps = [
"//caffe2:torch",
"//executorch/backends/vulkan/_passes:vulkan_passes",
- "//executorch/backends/vulkan/quantizer:vulkan_quantizer",
"//executorch/backends/vulkan:vulkan_preprocess",
"//pytorch/ao:torchao", # @manual
]
diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py
index 4a30ab6c2de..438126a179f 100644
--- a/backends/vulkan/test/test_vulkan_passes.py
+++ b/backends/vulkan/test/test_vulkan_passes.py
@@ -3,15 +3,8 @@
import torch
-from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform
-from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform
from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass
-from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
- get_symmetric_quantization_config,
- VulkanQuantizer,
-)
-
from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
from executorch.exir.backend.canonical_partitioners.config_partitioner import (
@@ -94,66 +87,6 @@ def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) ->
class TestVulkanPasses(unittest.TestCase):
- def test_fuse_int8pack_mm(self):
- K = 256
- N = 256
- model = SingleLinearModule(K, N)
- sample_inputs = model.get_sample_inputs()
-
- quantizer = VulkanQuantizer()
- quantizer.set_global(
- get_symmetric_quantization_config(is_dynamic=False, weight_bits=8)
- )
-
- edge_manager = quantize_and_lower_module(
- model,
- sample_inputs,
- quantizer,
- )
-
- ep = edge_manager._edge_programs["forward"]
- edge_manager.transform(
- [
- AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(ep),
- ]
- )
-
- gm = ep.graph_module
-
- self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1)
- self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
- def test_fuse_linear_qcs4w(self):
- K = 256
- N = 256
- model = SingleLinearModule(K, N)
- sample_inputs = model.get_sample_inputs()
-
- quantizer = VulkanQuantizer()
- quantizer.set_global(
- get_symmetric_quantization_config(is_dynamic=False, weight_bits=4)
- )
-
- edge_manager = quantize_and_lower_module(
- model,
- sample_inputs,
- quantizer,
- )
-
- ep = edge_manager._edge_programs["forward"]
- edge_manager.transform(
- [
- AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(ep),
- ]
- )
-
- gm = ep.graph_module
-
- self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1)
- self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
def test_fuse_rotary_emb(self):
"""Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op."""
@@ -238,7 +171,8 @@ def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor):
# Apply the rotary embedding pass
ep = edge_manager._edge_programs["forward"]
- rotary_pass = FusePatternsPass(ep)
+ rotary_pass = FusePatternsPass()
+ rotary_pass._exported_program = ep
result = rotary_pass.call(ep.graph_module)
# Verify that the pass was successful
diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py
index bfe4e9fceee..a887c53473a 100644
--- a/backends/vulkan/test/utils.py
+++ b/backends/vulkan/test/utils.py
@@ -90,7 +90,9 @@ def export_model_to_vulkan(
qmode=QuantizationMode.NONE,
):
compile_options = {}
- exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode)
+ exported_graph = get_exported_graph(
+ model, sample_inputs, dynamic_shapes=dynamic_shapes, qmode=qmode
+ )
program = export(
exported_graph,
sample_inputs,
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 972a4f26c1b..09c57f649ae 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -128,7 +128,7 @@ def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool:
is_get_attr_node(node)
or is_param(program, node)
or is_buffer(program, node)
- or is_constant(program, node)
+ or is_lifted_tensor_constant(program, node)
)
@@ -206,6 +206,8 @@ def is_tensor_arg_node(node: Any) -> bool:
if isinstance(node, torch.fx.Node):
return is_tensor_node(node)
elif isinstance(node, (list, tuple)):
+ if len(node) == 0:
+ return False
return all(is_tensor_node(n) for n in node)
return False
@@ -1228,6 +1230,16 @@ def is_in_8bit_range(tensor: torch.Tensor) -> bool:
##
+def nchw_dim_to_whcn_dim(nchw_dim: int, ndim: int) -> int:
+ # Handle negative indices for nchw_dim
+ if nchw_dim < 0:
+ nchw_dim += ndim
+
+ assert nchw_dim >= 0 and nchw_dim < ndim
+ whcn_dim = (ndim - 1) - nchw_dim
+ return whcn_dim
+
+
def get_tensor_val_str(tensor_val: FakeTensor) -> str:
return f"{tensor_val.dtype}: {tensor_val.shape}"
@@ -1279,6 +1291,7 @@ def update_program_state_dict(
updated_tensor: torch.Tensor,
) -> None:
target_name = None
+ kind = None
# Iterate over all the tensors in the graph signature, and find
# the one corresponding to the parameter/buffer name
for input_ in program.graph_signature.input_specs:
@@ -1287,6 +1300,7 @@ def update_program_state_dict(
and isinstance(input_.arg, TensorArgument)
and input_.arg.name == buffer_name
):
+ kind = input_.kind
target_name = input_.target
break
@@ -1296,6 +1310,9 @@ def update_program_state_dict(
), f"could not find {buffer_name} in source program signature"
assert target_name in program.state_dict, f"could not find {target_name}"
+ if kind == InputKind.PARAMETER:
+ updated_tensor = torch.nn.Parameter(updated_tensor, requires_grad=False)
+
# Finally, overwrite the current tensor with updated tensor
program.state_dict[target_name] = updated_tensor
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
index 2f91d97ff58..876f7fa8900 100644
--- a/backends/vulkan/vulkan_preprocess.py
+++ b/backends/vulkan/vulkan_preprocess.py
@@ -8,7 +8,7 @@
from functools import partial
-from typing import Any, Dict, final, List
+from typing import Any, Callable, Dict, final, List
import executorch.backends.vulkan.utils as utils
@@ -56,7 +56,9 @@
from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
-from executorch.exir.program._program import _copy_module
+from executorch.exir.program._program import _transform
+
+from torch._export.verifier import Verifier
from torch.export._remove_auto_functionalized_pass import (
unsafe_remove_auto_functionalized_pass,
@@ -65,28 +67,34 @@
DEFAULT_DEBUG_HANDLE = 65535
+class _any_op(Verifier):
+ # Set training dialect to skip functional check in base verifier
+ dialect = "TRAINING"
+
+ def allowed_op_types(self):
+ return (Callable,)
+
+
# pyre-ignore
def apply_passes(program: ExportedProgram, passes) -> ExportedProgram:
for p in passes:
- if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
- new_gm = program.graph_module
- # This is a workaround to allow the memory planning pass to work without
- # having to first apply ToOutVarPass(). See the `greedy()` function in
- # `exir.memory_planning`; if this attribute isn't set, assertions in
- # `collect_spec_from_nodes()` will fail.
- if isinstance(p, MemoryPlanningPass):
- new_gm.encounter_to_out_var_failure = True
-
- new_gm_res = p(new_gm)
- assert new_gm_res is not None
- new_gm = new_gm_res.graph_module
-
+ if isinstance(p, MemoryPlanningPass) and hasattr(p, "run"):
+ p.run(program.graph_module)
+
+ elif issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
+ # Some passes require the ep to be provided. However, since the ep may be
+ # updated with each pass applied, the ep must be set right before calling
+ # the pass. _exported_program is the attribute used by XNNPACK and Vulkan
+ # passes to store the exported program.
+ if hasattr(p, "_exported_program"):
+ p._exported_program = program
+
+ program = _transform(program, p, override_verifiers=[_any_op])
# See the application of this function in exir/program/_program.py for more
# details on why this step is necessary.
if isinstance(p, SpecPropPass):
- p.update_placeholder_tensor_specs(program, new_gm)
+ p.update_placeholder_tensor_specs(program, program.graph_module)
- _copy_module(program.graph_module, new_gm)
else:
program = p(program)
@@ -159,17 +167,17 @@ def preprocess( # noqa: C901
program = apply_passes(
program,
[
- FusePatternsPass(program),
- RemoveRedundantOpsTransform(),
+ FuseBatchNormPass(program),
+ FusePatternsPass(),
+ FuseClampPass(),
AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(program),
+ RemoveRedundantOpsTransform(),
+ FuseQuantizedOpsTransform(),
ReplaceQDQPass(),
- FoldQDQPass(program),
+ FoldQDQPass(),
SqueezeUnsqueezeInputs(),
FuseViewCopyTransform(),
ViewCopyToSqueezeUnsqueezePass(),
- FuseBatchNormPass(program),
- FuseClampPass(),
],
)
@@ -215,6 +223,11 @@ def preprocess( # noqa: C901
mem_planning_suite = MemoryPlanningAlgorithmSuite(
algo_list=[greedy_memory_planning]
)
+ # This is a workaround to allow the memory planning pass to work without having
+ # to first apply ToOutVarPass(). See the `greedy()` function in
+ # `exir.memory_planning`; if this attribute isn't set, assertions in
+ # `collect_spec_from_nodes()` will fail.
+ program.graph_module.encounter_to_out_var_failure = True
program = apply_passes(
program,
[
diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py
index c90b501df6f..dace37e5473 100644
--- a/examples/vulkan/export.py
+++ b/examples/vulkan/export.py
@@ -14,22 +14,18 @@
import backends.vulkan.test.utils as test_utils
import torch
+import torchvision
-from executorch.backends.transforms.convert_dtype_pass import I64toI32
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
serialize_from_bundled_program_to_flatbuffer,
)
-from executorch.exir import (
- EdgeCompileConfig,
- ExecutorchBackendConfig,
- to_edge_transform_and_lower,
-)
+from executorch.exir import to_edge_transform_and_lower
from executorch.extension.export_util.utils import save_pte_program
from executorch.extension.pytree import tree_flatten
-from torch.export import export
+from torch.export import Dim, export
from ..models import MODEL_NAME_TO_MODEL
from ..models.model_factory import EagerModelFactory
@@ -38,6 +34,67 @@
logging.basicConfig(level=logging.INFO, format=FORMAT)
+def is_vision_model(model_name):
+ if model_name in [
+ # These models are also registered in examples/models
+ "dl3",
+ "edsr",
+ "mv2",
+ "mv3",
+ "vit",
+ "ic3",
+ "ic4",
+ "resnet18",
+ "resnet50",
+ # These models are not registered in examples/models but are available via
+ # torchvision
+ "convnext_small",
+ "densenet161",
+ "shufflenet_v2_x1_0",
+ ]:
+ return True
+
+ return False
+
+
+def get_vision_model_sample_input():
+ return (torch.randn(1, 3, 224, 224),)
+
+
+def get_vision_model_dynamic_shapes():
+ return (
+ {
+ 2: Dim("height", min=1, max=16) * 16,
+ 3: Dim("width", min=1, max=16) * 16,
+ },
+ )
+
+
+def init_model(model_name):
+ if model_name == "convnext_small":
+ return torchvision.models.convnext_small()
+ if model_name == "densenet161":
+ return torchvision.models.densenet161()
+ if model_name == "shufflenet_v2_x1_0":
+ return torchvision.models.shufflenet_v2_x1_0()
+
+ return None
+
+
+def get_sample_inputs(model_name):
+ if is_vision_model(model_name):
+ return get_vision_model_sample_input()
+
+ return None
+
+
+def get_dynamic_shapes(model_name):
+ if is_vision_model(model_name):
+ return get_vision_model_dynamic_shapes()
+
+ return None
+
+
def main() -> None:
logger = logging.getLogger("")
logger.setLevel(logging.INFO)
@@ -68,21 +125,6 @@ def main() -> None:
help="whether to export with strict mode. Default is True",
)
- parser.add_argument(
- "-a",
- "--segment_alignment",
- required=False,
- help="specify segment alignment in hex. Default is 0x1000. Use 0x4000 for iOS",
- )
-
- parser.add_argument(
- "-e",
- "--external_constants",
- action=argparse.BooleanOptionalAction,
- default=False,
- help="Save constants in external .ptd file. Default is False",
- )
-
parser.add_argument(
"-d",
"--dynamic",
@@ -119,31 +161,35 @@ def main() -> None:
args = parser.parse_args()
- if args.model_name not in MODEL_NAME_TO_MODEL:
- raise RuntimeError(
- f"Model {args.model_name} is not a valid name. "
- f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+ if args.model_name in MODEL_NAME_TO_MODEL:
+ model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
+ *MODEL_NAME_TO_MODEL[args.model_name]
)
+ else:
+ model = init_model(args.model_name)
+ example_inputs = get_sample_inputs(args.model_name)
+ dynamic_shapes = get_dynamic_shapes(args.model_name) if args.dynamic else None
- model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
- *MODEL_NAME_TO_MODEL[args.model_name]
- )
+ if model is None:
+ raise RuntimeError(
+ f"Model {args.model_name} is not a valid name. "
+ f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+ )
# Prepare model
model.eval()
# Setup compile options
compile_options = {}
- if args.dynamic or dynamic_shapes is not None:
+ if args.dynamic:
compile_options["require_dynamic_shapes"] = True
+ # Try to manually get the dynamic shapes for the model if not set
+ if dynamic_shapes is None:
+ dynamic_shapes = get_dynamic_shapes(args.model_name)
+
if args.force_fp16:
compile_options["force_fp16"] = True
- # Configure Edge compilation
- edge_compile_config = EdgeCompileConfig(
- _skip_dim_order=False, # Proper handling for Vulkan memory format
- )
-
logging.info(f"Exporting model {args.model_name} with Vulkan delegate")
# Export the model using torch.export
@@ -157,10 +203,6 @@ def main() -> None:
# Transform and lower with Vulkan partitioner
edge_program = to_edge_transform_and_lower(
program,
- compile_config=edge_compile_config,
- transform_passes=[
- I64toI32(edge_compile_config._skip_dim_order),
- ],
partitioner=[VulkanPartitioner(compile_options)],
generate_etrecord=args.etrecord,
)
@@ -169,13 +211,8 @@ def main() -> None:
f"Exported and lowered graph:\n{edge_program.exported_program().graph}"
)
- # Configure backend options
- backend_config = ExecutorchBackendConfig(external_constants=args.external_constants)
- if args.segment_alignment is not None:
- backend_config.segment_alignment = int(args.segment_alignment, 16)
-
# Create executorch program
- exec_prog = edge_program.to_executorch(config=backend_config)
+ exec_prog = edge_program.to_executorch()
# Save ETRecord if requested
if args.etrecord:
From 881915d21d8704eaee45183108626c77ed5fdfd4 Mon Sep 17 00:00:00 2001
From: Hardik Sharma
Date: Sat, 4 Oct 2025 10:32:16 -0700
Subject: [PATCH 126/266] Add platforms for all operator library sub-targets.
Differential Revision: D83680406
Pull Request resolved: https://github.com/pytorch/executorch/pull/14728
---
shim_et/xplat/executorch/codegen/codegen.bzl | 108 ++++++++++--------
.../kernels/prim_ops/selective_build.bzl | 1 +
2 files changed, 60 insertions(+), 49 deletions(-)
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 3546b64cdb6..0002884b2a4 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -1,12 +1,12 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_default_executorch_platforms", "is_xplat", "runtime", "struct_to_json")
load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
-load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
-load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
load(
"@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
"get_vec_deps",
"get_vec_preprocessor_flags",
)
+load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
+load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
load("@fbsource//xplat/executorch/kernels/prim_ops:selective_build.bzl", "prim_ops_registry_selective")
# Headers that declare the function signatures of the C++ functions that
@@ -96,15 +96,17 @@ def _get_prim_ops_registry_target(name, deps, aten_suffix, platforms):
Returns:
String: Target name for the appropriate prim ops registry
"""
+
# If selective build targets are specified, create a selective prim ops registry
# Create a selective prim ops registry using the existing function
selective_prim_ops_registry_name = name + "_selected_prim_ops_registry"
combined_prim_ops_header_target_name = name + "_combined_prim_ops_header"
selected_prim_operators_genrule(combined_prim_ops_header_target_name, deps, platforms)
+
# Use the existing prim_ops_registry_selective function
prim_ops_registry_selective(
name = selective_prim_ops_registry_name,
- selected_prim_ops_header_target = ":"+combined_prim_ops_header_target_name,
+ selected_prim_ops_header_target = ":" + combined_prim_ops_header_target_name,
aten_suffix = aten_suffix,
platforms = platforms,
)
@@ -123,11 +125,16 @@ def _extract_prim_ops_from_lists(ops, ops_dict):
Returns:
Tuple of (prim_ops, remaining_ops, remaining_ops_dict)
"""
+
def _is_aten_prim_op(op_name):
if not op_name.startswith("aten::"):
return False
for prim_suffix in [
- "sym_size", "sym_numel", "sym_max", "sym_min", "sym_float"
+ "sym_size",
+ "sym_numel",
+ "sym_max",
+ "sym_min",
+ "sym_float",
]:
if prim_suffix in op_name:
return True
@@ -169,7 +176,6 @@ def et_operator_library(
ops_schema_yaml_target = None,
server_generated_yaml_target = None,
**kwargs):
-
# Check if we should extract prim ops from the operator lists
# Note that selective build for prim ops doesnt support model or ops_schema_yaml_target or server_generated_yaml_target
# TODO: Add support for selective build for prim ops with model or ops_schema_yaml_target or server_generated_yaml_target
@@ -178,6 +184,7 @@ def et_operator_library(
if should_extract_prim_ops:
# Extract prim ops from ops and ops_dict
prim_ops, remaining_ops, remaining_ops_dict = _extract_prim_ops_from_lists(ops, ops_dict)
+
# Use the remaining ops (with prim ops removed) for the main et_operator_library
final_ops = remaining_ops
final_ops_dict = remaining_ops_dict
@@ -189,6 +196,7 @@ def et_operator_library(
selected_operator_yaml_filename = "selected_operators.yaml"
selected_prim_ops_filename = "selected_prim_ops.h"
+
# Generate the main operator library with the final ops
# do a dummy copy if server_generated_yaml_target is set
if server_generated_yaml_target:
@@ -231,6 +239,7 @@ def et_operator_library(
"--prim_op_names=" + ",".join(prim_ops),
"--output_dir=${OUT}",
]
+
# Here we generate the selected_prim_ops.h and the selected_operators.yaml file
# both with single genrule
genrule_cmd = genrule_cmd + [" && "] + prim_ops_genrule_cmd
@@ -307,7 +316,6 @@ def _prepare_genrule_and_lib(
if support_exceptions:
genrule_cmd.append("--add-exception-boundary")
-
# Sources for generated kernel registration lib
sources = MANUAL_REGISTRATION_SOURCES if manual_registration else GENERATED_SOURCES
@@ -371,7 +379,8 @@ def _prepare_custom_ops_genrule_and_lib(
custom_ops_yaml_path = None,
support_exceptions = True,
deps = [],
- kernels = []):
+ kernels = [],
+ platforms = get_default_executorch_platforms()):
"""Similar to _prepare_genrule_and_lib but for custom ops."""
genrules = {}
libs = {}
@@ -390,6 +399,7 @@ def _prepare_custom_ops_genrule_and_lib(
"--output_dir $OUT ").format(deps = " ".join(["\"{}\"".format(d) for d in deps])),
outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
default_outs = ["."],
+ platforms = platforms,
)
# genrule for generating operator kernel bindings
@@ -460,6 +470,7 @@ def exir_custom_ops_aot_lib(
kernels = kernels,
support_exceptions = support_exceptions,
deps = deps,
+ platforms = platforms,
)
for genrule in genrules:
runtime.genrule(
@@ -468,6 +479,7 @@ def exir_custom_ops_aot_lib(
cmd = genrules[genrule]["cmd"],
outs = genrules[genrule]["outs"],
default_outs = ["."],
+ platforms = platforms,
)
for compiler_lib in libs:
runtime.cxx_library(
@@ -538,7 +550,7 @@ def get_optimized_lib_deps():
"//executorch/runtime/kernel:kernel_includes",
] + get_vec_deps()
-def build_portable_header_lib(name, oplist_header_name, feature = None):
+def build_portable_header_lib(name, oplist_header_name, feature = None, **kwargs):
"""Build the portable headers into a header-only library.
Ensures that includes work across portable and optimized libs.
"""
@@ -546,21 +558,23 @@ def build_portable_header_lib(name, oplist_header_name, feature = None):
name = name,
srcs = [],
exported_headers = {
- "selected_op_variants.h":":{}[selected_op_variants]".format(oplist_header_name),
+ "selected_op_variants.h": ":{}[selected_op_variants]".format(oplist_header_name),
},
exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
header_namespace = "",
feature = feature,
+ **kwargs
)
def build_portable_lib(
- name,
- et_operator_lib_deps = [],
- oplist_header_name = None,
- portable_header_lib = None,
- feature = None,
- expose_operator_symbols = False,
- visibility = ["@EXECUTORCH_CLIENTS"]):
+ name,
+ et_operator_lib_deps = [],
+ oplist_header_name = None,
+ portable_header_lib = None,
+ feature = None,
+ expose_operator_symbols = False,
+ visibility = ["@EXECUTORCH_CLIENTS"],
+ platforms = get_default_executorch_platforms()):
"""
WARNING: Before using this, please consider using executorch_generated_lib instead. This
function is only for special cases where you need to build a portable kernel library with
@@ -639,9 +653,10 @@ def build_portable_lib(
# @lint-ignore BUCKLINT link_whole
link_whole = True,
feature = feature,
+ platforms = platforms,
)
-def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False):
+def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False, platforms = get_default_executorch_platforms()):
"""Build optimized lib from source. We build from source so that the generated header file,
selected_op_variants.h, can be used to selectively build the lib for different dtypes.
"""
@@ -661,7 +676,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
# Currently fbcode links all dependent libraries through shared
# library, and it blocks users like unit tests to use kernel
# implementation directly. So we enable this for xplat only.
- compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed","-Wno-global-constructors","-Wno-shadow",]
+ compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed", "-Wno-global-constructors", "-Wno-shadow"]
if not expose_operator_symbols and is_xplat():
# Removing '-fvisibility=hidden' exposes operator symbols.
# This allows operators to be called outside of the kernel registry.
@@ -674,6 +689,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
deps = get_portable_lib_deps() + get_optimized_lib_deps() + [":" + portable_header_lib],
compiler_flags = compiler_flags,
+ platforms = platforms,
preprocessor_flags = get_vec_preprocessor_flags(),
# sleef needs to be added as a direct dependency of the operator target when building for Android,
# or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of
@@ -699,10 +715,9 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
)
def selected_operators_genrule(
- name,
- deps,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ deps,
+ platforms = get_default_executorch_platforms()):
"""Generates selected_operators.yaml from the list of deps. We look into the trasitive closure of all the deps,
and look for macros `et_operator_library`.
@@ -725,10 +740,9 @@ def selected_operators_genrule(
)
def selected_prim_operators_genrule(
- name,
- deps,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ deps,
+ platforms = get_default_executorch_platforms()):
"""Generates selected_prim_ops.h from the list of deps. We look into the transitive closure of all the deps,
and look for targets with label `et_operator_library`.
@@ -750,12 +764,11 @@ def selected_prim_operators_genrule(
)
def dtype_header_genrule(
- name,
- visibility,
- deps = [],
- selected_operators_genrule_name = None,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ visibility,
+ deps = [],
+ selected_operators_genrule_name = None,
+ platforms = get_default_executorch_platforms()):
"""Generate selected_op_variants.h from selected_operators.yaml.
Given a `selected_operators.yaml` (passed in as selected_operators_genrule_name), we should be able to determine
@@ -921,15 +934,14 @@ def executorch_generated_lib(
index = index + 1
portable = name + "_check_portable_" + dep.split(":")[1] + str(index)
message = "Dtype selective build requires that the portable library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
- check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message)
+ check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message, platforms = platforms)
if ("//executorch/kernels/optimized:optimized_operators" in kernel_deps):
index = 0
for dep in deps:
index = index + 1
optimized = name + "_check_optimized_" + dep.split(":")[1] + str(index)
message = "Dtype selective build requires that the optimized library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
- check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message)
-
+ check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message, platforms = platforms)
aten_suffix = "_aten" if aten_mode else ""
@@ -995,7 +1007,7 @@ def executorch_generated_lib(
if dtype_selective_build:
# Build portable headers lib. Used for portable and optimized kernel libraries.
portable_header_lib = name + "_portable_header_lib"
- build_portable_header_lib(portable_header_lib, oplist_header_name, feature)
+ build_portable_header_lib(portable_header_lib, oplist_header_name, feature, platforms = platforms)
if "//executorch/kernels/portable:operators" in kernel_deps:
# Remove portable from kernel_deps as we're building it from source.
@@ -1003,7 +1015,7 @@ def executorch_generated_lib(
# Build portable lib.
portable_lib_name = name + "_portable_lib"
- build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols)
+ build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols, platforms = platforms)
kernel_deps.append(":{}".format(portable_lib_name))
if "//executorch/kernels/optimized:optimized_operators" in kernel_deps:
@@ -1012,7 +1024,7 @@ def executorch_generated_lib(
# Build optimized lib.
optimized_lib_name = name + "_optimized_lib"
- build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols)
+ build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols, platforms = platforms)
kernel_deps.append(":{}".format(optimized_lib_name))
# Exports headers that declare the function signatures of the C++ functions
@@ -1111,10 +1123,9 @@ def executorch_generated_lib(
#
# If build successfully, all of the `selected_operators.yaml` will be merged into 1 `selected_operators.yaml` for debugging purpose.
def executorch_ops_check(
- name,
- deps,
- **kwargs,
-):
+ name,
+ deps,
+ **kwargs):
runtime.genrule(
name = name,
macros_only = False,
@@ -1128,16 +1139,15 @@ def executorch_ops_check(
platforms = kwargs.pop("platforms", get_default_executorch_platforms()),
outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
default_outs = ["."],
- **kwargs,
+ **kwargs
)
def check_recursive_dependencies(
- name,
- parent,
- child,
- message = "",
- **kwargs,
-):
+ name,
+ parent,
+ child,
+ message = "",
+ **kwargs):
"""
Checks if child is a transitive dependency of parent and fails if it is.
The query runs the equivalent of `buck2 uquery "allpaths(parent, child)".
diff --git a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
index a5c89147801..73421f031ec 100644
--- a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
+++ b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
@@ -28,6 +28,7 @@ def prim_ops_registry_selective(name, selected_prim_ops_header_target, aten_suff
header_name: [header_name],
"selected_prim_ops.h": ["selected_prim_ops.h"]
},
+ platforms = kwargs.get("platforms", "CXX"),
default_outs = ["."],
)
runtime.cxx_library(
From 3d8b8d1d5f1cf74bf62cc9848e2a1cfe9d6804c0 Mon Sep 17 00:00:00 2001
From: cccclai
Date: Sat, 4 Oct 2025 16:33:48 -0700
Subject: [PATCH 127/266] fix test-huggingface-transformers-* tests (#14752)
Fix these tests
https://hud.pytorch.org/hud/pytorch/executorch/main/1?per_page=50&name_filter=huggingface-transformer
Optimum is installed in a somewhat unusual way: optimum is cloned inside
the executorch folder, and executorch is then installed from the nested
optimum folder. This commit installs optimum via pip instead. The
behavior should be the same; the tests still run as expected.
---
.github/workflows/trunk.yml | 53 +++++++++++++++----------------------
1 file changed, 22 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index ae3001ca920..adf3b7da151 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -823,11 +823,26 @@ jobs:
echo "Recipe: $RECIPE"
echo "Quantize: $QUANTIZE"
- echo "::group::Set up ExecuTorch"
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
- PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
+ echo "::group::Setup ExecuTorch"
+ PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
+ echo "::endgroup::"
+
+ echo "::group::Setup Huggingface"
+ pip install -U "huggingface_hub[cli]" accelerate
+ huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+ OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+ pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+ echo "::endgroup::"
+
+ echo "::group::Test MODEL: $MODEL RECIPE: $RECIPE QUANTIZE: $QUANTIZE"
+ export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
+ python .ci/scripts/test_huggingface_optimum_model.py --model "$MODEL" --recipe "$RECIPE" $QUANTIZE --model_dir "$OUTPUT_DIR"
+ echo "::endgroup::"
+
# Build executor_runner with ETdump enabled
PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
@@ -845,25 +860,6 @@ jobs:
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release
- echo "::endgroup::"
-
- echo "::group::Set up Hugging Face"
- pip install -U "huggingface_hub[cli]"
- huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
- OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
- git clone https://github.com/huggingface/optimum-executorch
- pushd optimum-executorch
- # There is no release yet, for CI stability, always test from the same commit on main
- git checkout $OPTIMUM_ET_COMMIT
- python install_dev.py --skip_override_torch
- popd
- pip list
- echo "::endgroup::"
-
- echo "::group::Run tests"
- export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
- python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR}
- echo "::endgroup::"
echo "::group::Generate artifacts for performance profiling"
./cmake-out/executor_runner \
@@ -930,16 +926,11 @@ jobs:
${CONDA_RUN} python install_executorch.py
echo "::endgroup::"
- echo "::group::Set up Hugging Face"
- pip install -U "huggingface_hub[cli]"
- huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
- OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
- git clone https://github.com/huggingface/optimum-executorch
- pushd optimum-executorch
- # There is no release yet, for CI stability, always test from the same commit on main
- git checkout $OPTIMUM_ET_COMMIT
- ${CONDA_RUN} python install_dev.py --skip_override_torch
- popd
+ echo "::group::Set up Huggingface"
+ ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+ ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+ OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+ ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
${CONDA_RUN} pip list
echo "::endgroup::"
From 3b16bc14ccb7e956b2a4bf0bdb541700596b1a20 Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada
Date: Sun, 5 Oct 2025 19:31:55 -0700
Subject: [PATCH 128/266] =?UTF-8?q?Summary:=20Use=20javaClassStatic()=20fo?=
=?UTF-8?q?r=20class=20references=20stored=20in=20static=20=E2=80=A6=20(#1?=
=?UTF-8?q?4744)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
…variables - creates global references safe for persistence
findClassLocal() returns a local reference. Storing it in a static
variable (e.g. `static auto exceptionClass = ...`) could trigger an
'invalid local reference' error, because local references become invalid
once the JNI frame ends.
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
### Summary
[PLEASE REMOVE] See [CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests)
for ExecuTorch PR guidelines.
[PLEASE REMOVE] If this PR closes an issue, please add a `Fixes
#` line.
[PLEASE REMOVE] If this PR introduces a fix or feature that should be
the upcoming release notes, please add a "Release notes: " label.
For a list of available release notes labels, check out
[CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests).
### Test plan
[PLEASE REMOVE] How did you test this PR? Please write down any manual
commands you used and note down tests that you have written if
applicable.
Co-authored-by: Github Executorch
---
extension/android/jni/jni_helper.cpp | 9 ++++++---
extension/android/jni/jni_helper.h | 7 +++++++
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/extension/android/jni/jni_helper.cpp b/extension/android/jni/jni_helper.cpp
index b92856bacb2..6491524c7ac 100644
--- a/extension/android/jni/jni_helper.cpp
+++ b/extension/android/jni/jni_helper.cpp
@@ -13,10 +13,13 @@ namespace executorch::jni_helper {
void throwExecutorchException(uint32_t errorCode, const std::string& details) {
// Get the current JNI environment
auto env = facebook::jni::Environment::current();
+ if (!env) {
+ return;
+ }
- // Find the Java ExecutorchRuntimeException class
- static auto exceptionClass = facebook::jni::findClassLocal(
- "org/pytorch/executorch/ExecutorchRuntimeException");
+ // stable/global class ref — safe to cache
+ static const auto exceptionClass =
+ JExecutorchRuntimeException::javaClassStatic();
// Find the static factory method: makeExecutorchException(int, String)
static auto makeExceptionMethod =
diff --git a/extension/android/jni/jni_helper.h b/extension/android/jni/jni_helper.h
index 996d75581d3..898c1619d9c 100644
--- a/extension/android/jni/jni_helper.h
+++ b/extension/android/jni/jni_helper.h
@@ -23,4 +23,11 @@ namespace executorch::jni_helper {
*/
void throwExecutorchException(uint32_t errorCode, const std::string& details);
+// Define the JavaClass wrapper
+struct JExecutorchRuntimeException
+ : public facebook::jni::JavaClass {
+ static constexpr auto kJavaDescriptor =
+ "Lorg/pytorch/executorch/ExecutorchRuntimeException;";
+};
+
} // namespace executorch::jni_helper
From f81e8346f4153cb2e21eb33a6bdce9c1008696ae Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Mon, 6 Oct 2025 14:23:08 +0200
Subject: [PATCH 129/266] Add strict-flag to ExportSession (#14588)
**Add strict export option to ExportRecipe**
Default is True, mirroring earlier behavior.
Also update ExportSession to handle this.
Signed-off-by: Erik Lundell
---
export/export.py | 5 ++++-
export/recipe.py | 3 +++
export/stages.py | 5 ++++-
3 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/export/export.py b/export/export.py
index 86a932d153c..1e9cdbde7c0 100644
--- a/export/export.py
+++ b/export/export.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -200,7 +201,9 @@ def _build_stages(self, stages: List[StageType]) -> Dict[StageType, Stage]:
aten_transform_passes = list(
self._export_recipe.aten_transform_passes
)
- stage = TorchExportStage(aten_transform_passes)
+ stage = TorchExportStage(
+ aten_transform_passes, strict=self._export_recipe.strict
+ )
elif stage_type == StageType.TO_EDGE_TRANSFORM_AND_LOWER:
stage = EdgeTransformAndLowerStage.from_recipe(self._lowering_recipe)
elif stage_type == StageType.TO_EDGE:
diff --git a/export/recipe.py b/export/recipe.py
index 18f4b8aebb9..4465da51956 100644
--- a/export/recipe.py
+++ b/export/recipe.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -151,6 +152,7 @@ class ExportRecipe:
executorch_backend_config: Optional backend configuration for ExecuTorch
pipeline_stages: Optional list of stages to execute, defaults to a standard pipeline.
mode: Export mode (debug or release)
+ strict: Set the strict flag in the torch export call.
"""
name: Optional[str] = None
@@ -163,6 +165,7 @@ class ExportRecipe:
executorch_backend_config: Optional[ExecutorchBackendConfig] = None
pipeline_stages: Optional[List[StageType]] = None
mode: Mode = Mode.RELEASE
+ strict: bool = True
@classmethod
def get_recipe(cls, recipe: "RecipeType", **kwargs) -> "ExportRecipe":
diff --git a/export/stages.py b/export/stages.py
index 323b327bfa4..3be801c6a14 100644
--- a/export/stages.py
+++ b/export/stages.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -110,9 +111,11 @@ def __init__(
aten_transform_passes: Optional[
List[Callable[[str, ExportedProgram], ExportedProgram]]
] = None,
+ strict=True,
) -> None:
super().__init__()
self._aten_transform_passes = aten_transform_passes
+ self.strict = strict
@property
def stage_type(self) -> str:
@@ -147,7 +150,7 @@ def run(self, artifact: PipelineArtifact) -> None:
model,
example_inputs[method_name][0],
dynamic_shapes=method_dynamic_shapes,
- strict=True,
+ strict=self.strict,
)
# Apply pre-edge transform passes if available
From 75ebd05eba32df37211e73012b7211a4a66d9b4c Mon Sep 17 00:00:00 2001
From: Surya Siddharth Pemmaraju
Date: Mon, 6 Oct 2025 07:52:32 -0700
Subject: [PATCH 130/266] Fix OpenVINO ci (#14784)
### Summary
Re enable OpenVINO CI
Fixes #14314
### Test plan
Tested this PR locally with setup-openvino.sh and test_openvino.sh
The CI should run these two scripts and verify that all tests are
passing
---
.ci/scripts/setup-openvino.sh | 20 +++++++++-----------
.ci/scripts/test_openvino.sh | 2 +-
.github/workflows/pull.yml | 1 -
backends/openvino/partitioner.py | 8 +++++++-
backends/openvino/preprocess.py | 8 ++++++++
5 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/.ci/scripts/setup-openvino.sh b/.ci/scripts/setup-openvino.sh
index ff667619125..587494f46ac 100755
--- a/.ci/scripts/setup-openvino.sh
+++ b/.ci/scripts/setup-openvino.sh
@@ -10,19 +10,17 @@ set -ex
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
-git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout releases/2025/1
-git submodule update --init --recursive
-sudo ./install_build_dependencies.sh
-mkdir build && cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON
-make -j$(nproc)
+# Download and install OpenVINO from release packages
+OPENVINO_VERSION="2025.3"
+OPENVINO_BUILD="2025.3.0.19807.44526285f24"
+OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
-cd ..
-cmake --install build --prefix dist
+curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL}
+tar -xzf /tmp/openvino_toolkit.tgz
+mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino
-source dist/setupvars.sh
-cd ../backends/openvino
+source openvino/setupvars.sh
+cd backends/openvino
pip install -r requirements.txt
cd scripts
./openvino_build.sh --enable_python
diff --git a/.ci/scripts/test_openvino.sh b/.ci/scripts/test_openvino.sh
index 85884a6475b..2bb2115b1ec 100755
--- a/.ci/scripts/test_openvino.sh
+++ b/.ci/scripts/test_openvino.sh
@@ -10,7 +10,7 @@ set -ex
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
-source openvino/dist/setupvars.sh
+source openvino/setupvars.sh
cd backends/openvino/tests
python test_runner.py --test_type ops
python test_runner.py --test_type models
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 845cb5d8631..8248a9637ec 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -787,7 +787,6 @@ jobs:
contents: read
strategy:
fail-fast: false
- if: false # TODO Re-enable after fixing timeouts (#14314)
with:
runner: linux.2xlarge
docker-image: ci-image:executorch-ubuntu-22.04-gcc9
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..4975dc657c6 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,6 +27,10 @@
class OpenvinoOperatorsSupport(OperatorSupportBase):
+ extended_support_dict = {
+ "torch.ops.dim_order_ops._clone_dim_order.default": None,
+ "torch.ops.dim_order_ops._to_dim_order_copy.default": None,
+ }
def __init__(
self,
@@ -62,7 +66,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
op_type = node.target.__name__
else:
op_type = str(node.target)
- supported_ops = OperatorSupport(options)._support_dict
+ supported_ops = (
+ OperatorSupport(options)._support_dict | self.extended_support_dict
+ )
if op_type == "getitem":
return True
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..691115f6579 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -14,6 +14,8 @@
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped]
openvino_compile,
)
@@ -36,6 +38,12 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
+ transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module)
+
+ # Update the edge_program with the transformed graph
+ if transformed_ep and transformed_ep.graph_module:
+ edge_program._graph_module = transformed_ep.graph_module
+
input_names = edge_program.graph_signature.user_inputs
args = []
for node in edge_program.graph.nodes:
From 9a7fb42d5ac95ec0d8f30759625fd9dfcca4f1db Mon Sep 17 00:00:00 2001
From: Yufeng Shi
Date: Mon, 6 Oct 2025 17:19:11 +0100
Subject: [PATCH 131/266] Arm backend: Fix torch.matmul() failures for 2D
tensor inputs (#14624)
- ConvertMmToBmmPass converts an MM node to BMM nodes, turns input and
output tensors from rank-2 to rank-3 via unsqueeze/squeeze, and inserts
q-dq before and after BMM node when necessary.
- After ConvertMmToBmmPass:
```
x -> q -> dq -> unsqueeze -> q_2 -> dq_2 ->
\
bmm -> q_4 -> dq_4
/
y -> q_1 -> dq_1 -> unsqueeze -> q_3 -> dq_3 ->
```
- Therefore, if the original matmul was 2D, the bmm already has DQ nodes
on its inputs and Q node on its output. If AnnotateDecomposedMatmulPass
(#10654) is still applied in this case, it produces illegal sequences
such as: x -> q -> unsqueeze -> q_2 (invalid)
- Fix by checking whether the BMM is already surrounded by DQ nodes on
its inputs and Q nodes on its output.
Change-Id: I9949d59b0b4a96fa34a88b0734014567ea6f24cc
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218
Signed-off-by: Yufeng Shi
Co-authored-by: Oscar Andersson
---
backends/arm/_passes/annotate_decomposed_matmul.py | 9 +++++++--
backends/arm/test/ops/test_matmul.py | 7 +++++++
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
index 6b89b0c3c4a..72ae46c76c1 100644
--- a/backends/arm/_passes/annotate_decomposed_matmul.py
+++ b/backends/arm/_passes/annotate_decomposed_matmul.py
@@ -73,7 +73,10 @@ def call(self, graph_module: GraphModule) -> PassResult:
node for node in partition.nodes if node.target in matmul_targets
][0]
- if quantized_input:
+ if quantized_input and not all(
+ input_node.target in DQ_OPS
+ for input_node in matmul_node.all_input_nodes
+ ):
matmul_args = matmul_node.all_input_nodes
for node in matmul_args:
# Find the dq-node connected to this mm/bmm arg
@@ -99,7 +102,9 @@ def call(self, graph_module: GraphModule) -> PassResult:
partition_output = list(partition.output_nodes[0].users)[0]
quantized_output = partition_output.target in Q_OPS
- if quantized_output:
+ if quantized_output and not all(
+ user.target in Q_OPS for user in matmul_node.users
+ ):
with graph_module.graph.inserting_after(matmul_node):
# Create q-node after matmul
q_node = create_node(
diff --git a/backends/arm/test/ops/test_matmul.py b/backends/arm/test/ops/test_matmul.py
index a788fc00a5d..f564672e98f 100644
--- a/backends/arm/test/ops/test_matmul.py
+++ b/backends/arm/test/ops/test_matmul.py
@@ -22,6 +22,7 @@
class MatMul(torch.nn.Module):
test_data_generators = {
+ "rand_rand_2d": lambda: (torch.rand(5, 5), torch.rand(5, 2)),
"rand_rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
"rand_rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
}
@@ -32,6 +33,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
class MatMulSingleInput(torch.nn.Module):
test_data_generators = {
+ "rand_2d": lambda: (torch.rand(5, 5),),
"rand_3d": lambda: (torch.rand(2, 5, 5),),
"rand_4d": lambda: (torch.rand(1, 2, 5, 5),),
}
@@ -42,6 +44,11 @@ def forward(self, x: torch.Tensor):
class MatMulCombo(torch.nn.Module):
test_data_generators = {
+ "rand_rand_rand_2d": lambda: (
+ torch.rand(5, 5),
+ torch.rand(5, 2),
+ torch.rand(2, 5),
+ ),
"rand_rand_rand_3d": lambda: (
torch.rand(2, 5, 5),
torch.rand(2, 5, 2),
From ed3fdad208ccf9309a61c60ed3a262fb796f8848 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 09:59:22 -0700
Subject: [PATCH 132/266] Update extension/llm/tokenizers (#14807)
---
extension/llm/tokenizers | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 65e41a96e1b..ee0ad9b6e84 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
+Subproject commit ee0ad9b6e84622589911e2855a111b4278db114b
From 815ae92399815df6976620dbf977561ae79c4780 Mon Sep 17 00:00:00 2001
From: Ethan Ng
Date: Mon, 6 Oct 2025 10:25:13 -0700
Subject: [PATCH 133/266] Update
ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass to check if
is_tensor() is valid
Differential Revision: D83861005
Pull Request resolved: https://github.com/pytorch/executorch/pull/14798
---
backends/cadence/aot/replace_ops.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 2104764cd14..24390da5e16 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -1590,7 +1590,7 @@ def call_operator(self, op, args, kwargs, meta):
updated_args = list(args)
for op_arg_index in args_to_be_replaced:
arg = args[op_arg_index]
- if not arg.is_tensor():
+ if not isinstance(arg, ProxyValue) or not arg.is_tensor():
return super().call_operator(op, args, kwargs, meta)
if not isinstance(arg.node.target, EdgeOpOverload):
From 8c434ddb066feafa3773ac4332a7fed62e9c6c76 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Mon, 6 Oct 2025 12:02:06 -0600
Subject: [PATCH 134/266] [Windows] Enable LLM preset in CI (#14805)
### Summary
Testing more extensions on Windows.
---
.github/workflows/build-presets.yml | 2 +-
tools/cmake/preset/windows.cmake | 9 ++-------
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml
index 66ab19eef3c..46031ac7ea3 100644
--- a/.github/workflows/build-presets.yml
+++ b/.github/workflows/build-presets.yml
@@ -109,7 +109,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- preset: [pybind, windows]
+ preset: [pybind, windows, llm]
with:
job-name: build
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/tools/cmake/preset/windows.cmake b/tools/cmake/preset/windows.cmake
index b75a5af578e..ef8bbbedbbf 100644
--- a/tools/cmake/preset/windows.cmake
+++ b/tools/cmake/preset/windows.cmake
@@ -4,14 +4,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/llm.cmake)
+
# keep sorted
set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
-set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
-set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
-set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
From 563a5d244e42787b5b94702b4766b95287257dd9 Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Mon, 6 Oct 2025 20:02:54 +0200
Subject: [PATCH 135/266] Arm backend: Remove CheckNeedsDecomposition (#14512)
Remove redundant check as this can be covered by TOSAProIntSupportList.
Signed-off-by: Oscar Andersson
---
.../tosa_profile_supported_op_lists.py | 22 +-------
.../tosa_supported_operators.py | 55 +------------------
2 files changed, 3 insertions(+), 74 deletions(-)
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index d763ef23df2..86db2d9b0b6 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -18,6 +18,7 @@
# INT profile: ops supported via native TOSA ops, decompositions/transformations, precompute, TableOps, etc.
+# Note that ops supported via pre-quantization decompositions are not included here.
TOSA_PRO_INT_SupportList: Final[Set] = {
exir_ops.edge.aten.abs.default,
exir_ops.edge.aten.add.Tensor,
@@ -46,8 +47,6 @@
exir_ops.edge.aten.hardsigmoid.default,
exir_ops.edge.aten.hardtanh.default,
exir_ops.edge.aten.hardswish.default,
- exir_ops.edge.aten.div.Tensor,
- exir_ops.edge.aten.div.Tensor_mode,
exir_ops.edge.aten.eq.Tensor,
exir_ops.edge.aten.eq.Scalar,
exir_ops.edge.aten.erf.default,
@@ -68,16 +67,7 @@
exir_ops.edge.aten.lt.Tensor,
exir_ops.edge.aten.lt.Scalar,
exir_ops.edge.aten.mul.Tensor,
- exir_ops.edge.aten.ne.Tensor,
- exir_ops.edge.aten.ne.Scalar,
exir_ops.edge.aten.neg.default,
- exir_ops.edge.aten.add.Scalar,
- exir_ops.edge.aten.sub.Scalar,
- exir_ops.edge.aten.mul.Scalar,
- exir_ops.edge.aten.div.Scalar,
- exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
- exir_ops.edge.aten.native_layer_norm.default,
- exir_ops.edge.aten.native_group_norm.default,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.mean.dim,
exir_ops.edge.aten.mm.default,
@@ -86,19 +76,12 @@
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.reciprocal.default,
exir_ops.edge.aten.relu.default,
- exir_ops.edge.aten.leaky_relu.default,
- exir_ops.edge.aten.sqrt.default,
exir_ops.edge.aten.rsqrt.default,
- exir_ops.edge.aten.round.default,
- exir_ops.edge.aten._softmax.default,
exir_ops.edge.aten.select_copy.int,
- exir_ops.edge.aten._log_softmax.default,
exir_ops.edge.aten.sub.Tensor,
exir_ops.edge.aten.tanh.default,
exir_ops.edge.aten.upsample_bilinear2d.vec,
exir_ops.edge.aten.upsample_nearest2d.vec,
- exir_ops.edge.aten.var.correction,
- exir_ops.edge.aten.var.dim,
exir_ops.edge.aten.view_copy.default,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.squeeze_copy.dims,
@@ -127,12 +110,9 @@
exir_ops.edge.aten.sign.default,
exir_ops.edge.aten.asin.default,
exir_ops.edge.aten.atanh.default,
- exir_ops.edge.aten.addmm.default,
exir_ops.edge.aten.masked_fill.Scalar,
exir_ops.edge.aten.asinh.default,
exir_ops.edge.aten.cosh.default,
- exir_ops.edge.aten.glu.default,
- exir_ops.edge.aten.logit.default,
exir_ops.edge.aten.acos.default,
exir_ops.edge.aten.elu.default,
exir_ops.edge.aten.bitwise_not.default,
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index 86c53e4aff1..f7dace09c0b 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -135,7 +135,6 @@ def tosa_support_factory(
]
if not tosa_spec.support_float():
- negative_checks.append(NeedsDecompositionCheck(reporter))
negative_checks.append(CheckProperQuantization(reporter))
if tosa_spec.is_U55_subset:
negative_checks.append(EthosU55NotSupported(reporter))
@@ -156,7 +155,8 @@ def tosa_support_factory(
class TOSAProINTSupportList(OperatorSupportBase):
"""
TOSA_PRO_INT_SupportList:
- Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps
+ Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps.
+ Note that ops supported via pre-quantization decompositions are not included here.
"""
def is_node_supported(
@@ -179,57 +179,6 @@ def is_node_supported(
return node.op == "call_function" and node.target in TOSA_PRO_FP_SupportList
-class NeedsDecompositionCheck(OperatorSupportBase):
- """
- Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding
- the operator, and to get optimal quantization parameters for each operator. This check will reject operators
- that need to be decomposed.
- """
-
- def __init__(self, reporter: WhyNoPartitionReporter):
- self.reporter = reporter
-
- def is_node_supported(
- self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
- ) -> bool:
-
- if node.op != "call_function":
- return True
-
- needs_decomp_dict = {
- exir_ops.edge.aten.div.Tensor: None,
- exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.",
- exir_ops.edge.aten.native_layer_norm.default: None,
- exir_ops.edge.aten.native_group_norm.default: None,
- exir_ops.edge.aten._softmax.default: None,
- exir_ops.edge.aten._log_softmax.default: None,
- exir_ops.edge.aten.var.correction: None,
- exir_ops.edge.aten.var.dim: None,
- exir_ops.edge.aten.add.Scalar: None,
- exir_ops.edge.aten.sqrt.default: None,
- exir_ops.edge.aten.sub.Scalar: None,
- exir_ops.edge.aten.mul.Scalar: None,
- exir_ops.edge.aten.ne.Tensor: None,
- exir_ops.edge.aten.ne.Scalar: None,
- exir_ops.edge.aten.div.Scalar: None,
- exir_ops.edge.aten.leaky_relu.default: None,
- exir_ops.edge.aten.round.default: None,
- exir_ops.edge.aten.addmm.default: None,
- exir_ops.edge.aten.glu.default: None,
- exir_ops.edge.aten.logit.default: None,
- }
-
- if node.target in needs_decomp_dict:
- reject_message = needs_decomp_dict[node.target]
- if reject_message is None:
- reject_message = "Op needs to be decomposed into other ops before quantization to get quantized properly."
-
- self.reporter.report_reject(node, reject_message)
- return False
- else:
- return True
-
-
class CheckProperQuantization(OperatorSupportBase):
"""
For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize
From 8484aeead6203f96b1033d7df5b3d51baefed3c6 Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Mon, 6 Oct 2025 20:03:51 +0200
Subject: [PATCH 136/266] Arm backend: Backend test serializes and uses
EthosUQuant on Ethos-U flows (#14817)
### Summary
Serialize and quantize automatically when possible. This makes the Ethos-U
flows work.
### Test plan
This is run by the backend test suite for Ethos-U
Signed-off-by: Zingo Andersen
---
backends/test/suite/flows/arm.py | 60 +++++++++++++++-----------------
1 file changed, 28 insertions(+), 32 deletions(-)
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
index 34a6346fb1f..85674331eda 100644
--- a/backends/test/suite/flows/arm.py
+++ b/backends/test/suite/flows/arm.py
@@ -3,70 +3,66 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+# Create flows for Arm backends used to test operator and model suites
-from executorch.backends.arm.quantizer import (
- get_symmetric_quantization_config,
- TOSAQuantizer,
-)
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.util._factory import create_quantizer
from executorch.backends.test.suite.flow import TestFlow
from executorch.backends.xnnpack.test.tester.tester import Quantize
-def _create_tosa_flow(
+def _create_arm_flow(
name,
- compile_spec,
- quantize: bool = False,
+ compile_spec: ArmCompileSpec,
symmetric_io_quantization: bool = False,
per_channel_quantization: bool = True,
) -> TestFlow:
def _create_arm_tester(*args, **kwargs) -> ArmTester:
kwargs["compile_spec"] = compile_spec
+ return ArmTester(*args, **kwargs)
+
+ support_serialize = not isinstance(compile_spec, TosaCompileSpec)
+ quantize = compile_spec.tosa_spec.support_integer()
- return ArmTester(
- *args,
- **kwargs,
- )
+ if quantize is True:
- # Create and configure quantizer to use in the flow
- def create_quantize_stage() -> Quantize:
- quantizer = TOSAQuantizer(compile_spec)
- quantization_config = get_symmetric_quantization_config(
- is_per_channel=per_channel_quantization
- )
- if symmetric_io_quantization:
- quantizer.set_io(quantization_config)
- return Quantize(quantizer, quantization_config)
+ def create_quantize_stage() -> Quantize:
+ quantizer = create_quantizer(compile_spec)
+ quantization_config = get_symmetric_quantization_config(
+ is_per_channel=per_channel_quantization
+ )
+ if symmetric_io_quantization:
+ quantizer.set_io(quantization_config)
+ return Quantize(quantizer, quantization_config)
return TestFlow(
name,
backend="arm",
tester_factory=_create_arm_tester,
- supports_serialize=False,
+ supports_serialize=support_serialize,
quantize=quantize,
- quantize_stage_factory=create_quantize_stage if quantize else None,
+ quantize_stage_factory=(create_quantize_stage if quantize is True else False),
)
-ARM_TOSA_FP_FLOW = _create_tosa_flow(
- "arm_tosa_fp", common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+ARM_TOSA_FP_FLOW = _create_arm_flow(
+ "arm_tosa_fp",
+ common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
)
-ARM_TOSA_INT_FLOW = _create_tosa_flow(
+ARM_TOSA_INT_FLOW = _create_arm_flow(
"arm_tosa_int",
common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- quantize=True,
)
-
-ARM_ETHOS_U55_FLOW = _create_tosa_flow(
+ARM_ETHOS_U55_FLOW = _create_arm_flow(
"arm_ethos_u55",
common.get_u55_compile_spec(),
- quantize=True,
)
-
-ARM_ETHOS_U85_FLOW = _create_tosa_flow(
+ARM_ETHOS_U85_FLOW = _create_arm_flow(
"arm_ethos_u85",
common.get_u85_compile_spec(),
- quantize=True,
)
From b6bc421f2c01c38cb8a300a1cee6799151cf7818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?=
Date: Mon, 6 Oct 2025 20:05:17 +0200
Subject: [PATCH 137/266] Arm backend: Fix Arm tester issue for inplace ops
(#14625)
Deep-copying the input avoids it getting mutated by the first reference
run.
---
backends/arm/test/ops/test_silu.py | 5 -----
backends/arm/test/tester/arm_tester.py | 10 +++++++---
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py
index 25117ef89de..362358d0813 100644
--- a/backends/arm/test/ops/test_silu.py
+++ b/backends/arm/test/ops/test_silu.py
@@ -8,7 +8,6 @@
from typing import Optional, Tuple
-import pytest
import torch
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.test_pipeline import (
@@ -125,7 +124,6 @@ def test_silu_u85_INT_inplace(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_FP(test_data: input_t):
silu_data = (test_data(), False)
pipeline = VgfPipeline[input_t](
@@ -136,7 +134,6 @@ def test_silu_vgf_FP(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_FP_inplace(test_data: input_t):
silu_data = (test_data(), True)
pipeline = VgfPipeline[input_t](
@@ -147,7 +144,6 @@ def test_silu_vgf_FP_inplace(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_INT(test_data: input_t):
silu_data = (test_data(), False)
pipeline = VgfPipeline[input_t](
@@ -161,7 +157,6 @@ def test_silu_vgf_INT(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_INT_inplace(test_data: input_t):
silu_data = (test_data(), True)
pipeline = VgfPipeline[input_t](
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 9f530f428ce..0cba8d987c0 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -430,6 +430,10 @@ def run_method_and_compare_outputs(
for run_iteration in range(num_runs):
reference_input = inputs if inputs else next(self.generate_random_inputs())
+ # Avoid issues with inplace operators
+ test_input = copy.deepcopy(reference_input)
+ original_input = copy.deepcopy(reference_input)
+
input_shapes = [
generated_input.shape if hasattr(generated_input, "shape") else (1,)
for generated_input in reference_input
@@ -444,16 +448,16 @@ def run_method_and_compare_outputs(
# Run exported module directly
test_outputs, _ = pytree.tree_flatten(
self._calculate_reference_output(
- exported_program.module(), reference_input
+ exported_program.module(), test_input
)
)
else:
# Run lowered model with target
test_outputs, _ = pytree.tree_flatten(
- test_stage.run_artifact(reference_input)
+ test_stage.run_artifact(test_input)
)
- logger.info(f"\n Input: {reference_input}")
+ logger.info(f"\n Input: {original_input}")
logger.info(f"\n Ref output: {reference_outputs}")
logger.info(f"\nTest output: {test_outputs}")
From 6e7353f2c337afe0882ddb3579c4bdfdf6f24718 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?=
Date: Mon, 6 Oct 2025 20:06:29 +0200
Subject: [PATCH 138/266] Arm backend: Add 6D tensor and pixel
shuffle/unshuffle support (#14626)
Adds 6D tensor support required by pixel_shuffle/pixel_unshuffle when
given 4D inputs, which means for now we only support 4D inputs. Adds
TOSA, VGF and xfailing Ethos-U85 unit tests.
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218
---
.../arm/_passes/to_tosa_memory_format_pass.py | 101 +++++---
backends/arm/constants.py | 7 +-
.../tosa_supported_operators.py | 4 +-
.../arm/quantizer/quantization_annotator.py | 2 +
backends/arm/scripts/parse_test_names.py | 2 +
.../test_SD3Transformer2DModel.py | 4 -
backends/arm/test/ops/test_pixel_shuffling.py | 233 ++++++++++++++++++
backends/arm/tosa/dialect/ops/transpose.py | 4 +-
8 files changed, 310 insertions(+), 47 deletions(-)
create mode 100644 backends/arm/test/ops/test_pixel_shuffling.py
diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py
index dcbdfb03f7b..b906c06b329 100644
--- a/backends/arm/_passes/to_tosa_memory_format_pass.py
+++ b/backends/arm/_passes/to_tosa_memory_format_pass.py
@@ -26,6 +26,9 @@
NNCHW_ORDER,
NNHWC_INVERSE_ORDER,
NNHWC_ORDER,
+ NNNCHW_ORDER,
+ NNNHWC_INVERSE_ORDER,
+ NNNHWC_ORDER,
)
from executorch.exir import ExportedProgram
from executorch.exir.dialects._ops import ops as exir_ops
@@ -51,12 +54,6 @@ class ToTosaMemoryFormatPass(ExportPass):
_passes_required_after: Set[Type[ExportPass]] = set()
- NHWC_order = (0, 2, 3, 1)
- NHWC_inverse_order = (0, 3, 1, 2)
- HWCM_order = (2, 3, 0, 1)
- NNHWC_order = (0, 1, 3, 4, 2)
- NNHWC_inverse_order = (0, 1, 4, 2, 3)
-
def __init__(self, exported_program: ExportedProgram) -> None:
self.exported_program = exported_program
super().__init__()
@@ -93,7 +90,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
@staticmethod
def memory_format_differs(shape):
"""Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format"""
- if len(shape) >= 5:
+ if len(shape) >= 6:
+ C = shape[3]
+ H = shape[4]
+ W = shape[5]
+ elif len(shape) == 5:
C = shape[2]
H = shape[3]
W = shape[4]
@@ -112,25 +113,26 @@ def memory_format_differs(shape):
@staticmethod
def is_channel_reshape(input_shape, output_shape):
- """Returns true if the reshape changes the channel dimension"""
- if not (
- (len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5)))
- or (len(input_shape) == 4 and len(output_shape) == 5)
- or (len(input_shape) == 5 and len(output_shape) == 4)
- ):
+ """Returns true if reshape changes the channel dimension or batch product dimension(s)"""
+
+ valid_ranks = {4, 5, 6}
+
+ if not (len(input_shape) in valid_ranks and len(output_shape) in valid_ranks):
return False
C_old = input_shape[-3]
C_new = output_shape[-3]
- N_new = (
- output_shape[0]
- if len(output_shape) == 4
- else output_shape[0] * output_shape[1]
- )
- N_old = (
- input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1]
- )
+ def get_batch_prod_dim(shape):
+ product = 1
+
+ for dim in shape[:-3]:
+ product = product * dim
+
+ return product
+
+ N_old = get_batch_prod_dim(input_shape)
+ N_new = get_batch_prod_dim(output_shape)
return (N_old != N_new) or (C_old != C_new)
@@ -141,17 +143,27 @@ def insert_input_transpose(node, input_node, graph_module):
node.replace_input_with(input_node, pre_permute_node)
return
+ if len(get_first_fake_tensor(input_node).size()) == 6:
+ mem_format = NNNHWC_INVERSE_ORDER
+ elif len(get_first_fake_tensor(input_node).size()) == 5:
+ mem_format = NNHWC_INVERSE_ORDER
+ else:
+ mem_format = NHWC_INVERSE_ORDER
+ # Guard: mem_format must be a true permutation for the current rank
+ _rank_ = len(
+ get_first_fake_tensor(input_node).size()
+ ) # or (node) in output path
+ assert sorted(mem_format) == list(
+ range(_rank_)
+ ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose"
+
with graph_module.graph.inserting_before(node):
permute_node = create_node(
graph_module.graph,
exir_ops.backend.tosa.TRANSPOSE.default,
args=(
input_node,
- list(
- NNHWC_INVERSE_ORDER
- if len(get_first_fake_tensor(input_node).size()) == 5
- else NHWC_INVERSE_ORDER
- ),
+ list(mem_format),
),
from_node=node,
)
@@ -163,26 +175,38 @@ def insert_input_transpose(node, input_node, graph_module):
@staticmethod
def insert_output_transpose(node, graph_module):
+
+ if len(get_first_fake_tensor(node).size()) == 6:
+ mem_format = NNNHWC_ORDER
+ elif len(get_first_fake_tensor(node).size()) == 5:
+ mem_format = NNHWC_ORDER
+ else:
+ mem_format = NHWC_ORDER
+ # Guard: mem_format must be a true permutation for the current rank
+ _rank_ = len(get_first_fake_tensor(node).size()) # or (node) in output path
+ assert sorted(mem_format) == list(
+ range(_rank_)
+ ), f"bad perm {mem_format} for rank {_rank_} in insert_output_transpose"
+
with graph_module.graph.inserting_after(node):
permute_node = create_node(
graph_module.graph,
exir_ops.backend.tosa.TRANSPOSE.default,
args=(
node,
- list(
- NNHWC_ORDER
- if len(get_first_fake_tensor(node).size()) == 5
- else NHWC_ORDER
- ),
+ list(mem_format),
),
from_node=node,
)
- permute_node.meta["tosa_dim_order"] = (
- NNHWC_ORDER
- if len(get_first_fake_tensor(node).size()) == 5
- else NHWC_ORDER
- )
+ rank = len(get_first_fake_tensor(node).size())
+ if rank == 6:
+ permute_node.meta["tosa_dim_order"] = NNNHWC_ORDER
+ elif rank == 5:
+ permute_node.meta["tosa_dim_order"] = NNHWC_ORDER
+ else:
+ permute_node.meta["tosa_dim_order"] = NHWC_ORDER
+
node.meta["tosa_dim_order"] = tuple(
range(len(get_first_fake_tensor(node).size()))
)
@@ -261,7 +285,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
]
for input_node in inputs:
input_dim_order = get_first_fake_tensor(input_node).dim_order()
- if input_dim_order in (NCHW_ORDER, NNCHW_ORDER):
+ if input_dim_order in (NCHW_ORDER, NNCHW_ORDER, NNNCHW_ORDER):
self.insert_output_transpose(input_node, graph_module)
# Transpose outputs if they are in (N)NCHW format
@@ -276,6 +300,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
if output_dim_order in (
NCHW_ORDER,
NNCHW_ORDER,
+ NNNCHW_ORDER,
):
self.insert_input_transpose(
output_node, output_node_input, graph_module
@@ -313,6 +338,8 @@ def call(self, graph_module: torch.fx.GraphModule):
dim_order = HWCM_ORDER
elif node_data.dim() == 5:
dim_order = NNHWC_ORDER
+ elif node_data.dim() == 6:
+ dim_order = NNNHWC_ORDER
else:
dim_order = tuple(range(node_data.dim())) # type: ignore[assignment]
diff --git a/backends/arm/constants.py b/backends/arm/constants.py
index b9995410b23..0e562f12e88 100644
--- a/backends/arm/constants.py
+++ b/backends/arm/constants.py
@@ -34,10 +34,13 @@
NHWC_INVERSE_ORDER: Final = (0, 3, 1, 2)
NNHWC_ORDER: Final = (0, 1, 3, 4, 2)
NNHWC_INVERSE_ORDER: Final = (0, 1, 4, 2, 3)
+NNNHWC_ORDER: Final = (0, 1, 2, 4, 5, 3)
+NNNHWC_INVERSE_ORDER: Final = (0, 1, 2, 5, 3, 4)
NCHW_ORDER: Final = (0, 1, 2, 3)
-NCHW_INVERSE_ORDER: Final = (0, 2, 3, 1)
NNCHW_ORDER: Final = (0, 1, 2, 3, 4)
-NNCHW_INVERSE_ORDER: Final = (0, 1, 3, 4, 2)
+NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5)
HWCM_ORDER: Final = (2, 3, 0, 1)
+
+MAX_RANK: Final = 6
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index f7dace09c0b..f7857894d40 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -19,7 +19,7 @@
FuseQuantizedActivationPass,
)
from executorch.backends.arm._passes.insert_table_ops import TableOps
-from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.backends.arm.constants import DQ_OPS, MAX_RANK, Q_OPS
from executorch.backends.arm.operator_support.ethos_u55_support import (
EthosU55CastCheck,
EthosU55DtypeSupport,
@@ -127,7 +127,7 @@ def tosa_support_factory(
negative_checks: list[OperatorSupportBase] = [
CheckInt64InputsAndOutputs(exported_program, reporter),
CheckFloat64Inputs(exported_program, reporter),
- RankCheck(reporter, max_rank=5),
+ RankCheck(reporter, max_rank=MAX_RANK),
*[
reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
for check in (additional_checks if additional_checks else [])
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index ebc91c22bbb..349aa3e6b21 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -370,6 +370,8 @@ def _match_pattern(
torch.ops.aten.dropout_.default,
torch.ops.aten.adaptive_avg_pool2d.default,
torch.ops.aten.alias_copy.default,
+ torch.ops.aten.pixel_shuffle.default,
+ torch.ops.aten.pixel_unshuffle.default,
]
diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py
index 2629d8eb257..54f8aa7421d 100644
--- a/backends/arm/scripts/parse_test_names.py
+++ b/backends/arm/scripts/parse_test_names.py
@@ -26,6 +26,8 @@
"_native_batch_norm_legit_no_training.default",
"_native_batch_norm_legit.no_stats",
"alias_copy.default",
+ "pixel_shuffle.default",
+ "pixel_unshuffle.default",
]
ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index 1267c5b8e4c..9506fe727db 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -30,16 +30,12 @@ class TestSD3Transformer2DModel:
# Adjust nbr below as we increase op support.
ops_after_partitioner_FP = {
- "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
"executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1,
- "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
"executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1,
"torch.ops.higher_order.executorch_call_delegate": 1,
}
ops_after_partitioner_INT = {
- "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
- "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
"executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
"torch.ops.higher_order.executorch_call_delegate": 2,
}
diff --git a/backends/arm/test/ops/test_pixel_shuffling.py b/backends/arm/test/ops/test_pixel_shuffling.py
new file mode 100644
index 00000000000..5aeb8b2d1bb
--- /dev/null
+++ b/backends/arm/test/ops/test_pixel_shuffling.py
@@ -0,0 +1,233 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Tuple
+
+import pytest
+
+import torch
+
+from executorch.backends.arm.constants import MAX_RANK
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineINT,
+ EthosU85PipelineINT,
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+from torch import nn
+
+aten_op_pixel_unshuffle = "torch.ops.aten.pixel_unshuffle.default"
+exir_op_pixel_unshuffle = (
+ "executorch_exir_dialects_edge__ops_aten_pixel_unshuffle_default"
+)
+
+aten_op_pixel_shuffle = "torch.ops.aten.pixel_shuffle.default"
+exir_op_pixel_shuffle = "executorch_exir_dialects_edge__ops_aten_pixel_shuffle_default"
+
+input_t1 = Tuple[torch.Tensor] # single positional input (1-tuple)
+
+max_rank_input_supported = MAX_RANK - 2
+
+
+class PixelUnShuffle(nn.Module):
+
+ upscale_factor = 2
+ test_data_generators = {
+ "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+ "test_4d": lambda: (torch.tensor([[[[10.0, 20.0], [30.0, 40.0]]]]),),
+ "test_3d": lambda: (torch.tensor([[[10.0, 20.0], [30.0, 40.0]]]),),
+ }
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.space_to_depth = nn.PixelUnshuffle(self.upscale_factor)
+
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ if inputs.dim() > max_rank_input_supported:
+ raise RuntimeError(
+ f"Max rank of input for pixel_unshuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+ )
+ return self.space_to_depth(inputs)
+
+
+class PixelShuffle(nn.Module):
+
+ upscale_factor = 2
+ test_data_generators = {
+ "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+ "test_4d": lambda: (torch.tensor([[[[10.0]], [[20.0]], [[30.0]], [[40.0]]]]),),
+ "test_3d": lambda: (torch.tensor([[[10.0]], [[20.0]], [[30.0]], [[40.0]]]),),
+ }
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.depth_to_space = nn.PixelShuffle(self.upscale_factor)
+
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ if inputs.dim() > max_rank_input_supported:
+ raise RuntimeError(
+ f"Max rank of input for pixel_shuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+ )
+ return self.depth_to_space(inputs)
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_FP(test_data: input_t1):
+ pipeline = TosaPipelineFP[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_INT(test_data: input_t1):
+ pipeline = TosaPipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_FP(test_data: input_t1):
+ pipeline = TosaPipelineFP[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_INT(test_data: input_t1):
+ pipeline = TosaPipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_FP(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ tosa_version="TOSA-1.0+FP",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_INT(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ tosa_version="TOSA-1.0+INT",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_FP(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ tosa_version="TOSA-1.0+FP",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_INT(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ tosa_version="TOSA-1.0+INT",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_unshuffle_u55_INT(test_data: input_t1):
+ pipeline = EthosU55PipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_unshuffle_u85_INT(test_data: input_t1):
+ pipeline = EthosU85PipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_shuffle_u55_INT(test_data: input_t1):
+ pipeline = EthosU55PipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_shuffle_u85_INT(test_data: input_t1):
+ pipeline = EthosU85PipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
diff --git a/backends/arm/tosa/dialect/ops/transpose.py b/backends/arm/tosa/dialect/ops/transpose.py
index 9c5aba05394..8d5bf8bac70 100644
--- a/backends/arm/tosa/dialect/ops/transpose.py
+++ b/backends/arm/tosa/dialect/ops/transpose.py
@@ -26,9 +26,9 @@ def TRANSPOSE(a, perms):
# By utilizing an edge IR passthrough operator we can keep the edge program in
# channels-first/contiguous and get the desired behavior in the TOSA lowering.
- if len(perms) not in (4, 5):
+ if len(perms) not in (4, 5, 6):
raise TosaValueError(
- f"Only 4D and 5D tensors are supported, got {len(perms)}: {perms}",
+ f"Only 4D, 5D and 6D tensors are supported, got {len(perms)}: {perms}",
op="TRANSPOSE",
)
From 266cfd03c0814653d0fb4664b87ca3d2705d3a0e Mon Sep 17 00:00:00 2001
From: per held
Date: Mon, 6 Oct 2025 20:09:14 +0200
Subject: [PATCH 139/266] Arm backend: Add test for monitoring memory
allocation (#14657)
Simple test to monitor memory allocations when running the "add" model
in fvp.
Signed-off-by: per.held@arm.com
---
.github/workflows/trunk.yml | 1 +
backends/arm/test/test_arm_baremetal.sh | 15 ++
.../arm/test/test_memory_allocator_log.py | 170 ++++++++++++++++++
3 files changed, 186 insertions(+)
create mode 100644 backends/arm/test/test_memory_allocator_log.py
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index adf3b7da151..aabea88f517 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -289,6 +289,7 @@ jobs:
- test_arm_baremetal: test_models_ethos-u55
- test_arm_baremetal: test_models_ethos-u85
- test_arm_baremetal: test_smaller_stories_llama
+ - test_arm_baremetal: test_memory_allocation
fail-fast: false
with:
runner: linux.2xlarge.memory
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index be87ea629d8..b8e8aee4e3a 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -366,5 +366,20 @@ test_smaller_stories_llama() {
echo "${TEST_SUITE_NAME}: PASS"
}
+test_memory_allocation() {
+ echo "${TEST_SUITE_NAME}: Test ethos-u memory allocation with run.sh"
+
+ mkdir -p arm_test/test_run
+ # Ethos-U85
+ echo "${TEST_SUITE_NAME}: Test target Ethos-U85"
+ examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=examples/arm/example_modules/add.py &> arm_test/test_run/full.log
+ python3 backends/arm/test/test_memory_allocator_log.py --log arm_test/test_run/full.log \
+ --require "model_pte_program_size" "<= 3000 B" \
+ --require "method_allocator_planned" "<= 64 B" \
+ --require "method_allocator_loaded" "<= 1024 B" \
+ --require "method_allocator_input" "<= 4 B" \
+ --require "Total DRAM used" "<= 0.06 KiB"
+ echo "${TEST_SUITE_NAME}: PASS"
+}
${TEST_SUITE}
diff --git a/backends/arm/test/test_memory_allocator_log.py b/backends/arm/test/test_memory_allocator_log.py
new file mode 100644
index 00000000000..3853b60b7f6
--- /dev/null
+++ b/backends/arm/test/test_memory_allocator_log.py
@@ -0,0 +1,170 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Check log files for memory metrics and compare them against thresholds.
+
+Usage example:
+ python3 test_memory_allocator_log.py \
+ --log path/to/log.txt \
+ --require "Total SRAM used" "<= 310 KiB" \
+ --require "method_allocator_input" "<= 4 B"
+"""
+
+import argparse
+import re
+import sys
+from typing import List, Optional, Tuple
+
+
+def unit_factor(u: str) -> float:
+ if not u:
+ return 1.0
+ ul = u.strip().lower()
+ table = {
+ "b": 1,
+ "byte": 1,
+ "bytes": 1,
+ "kb": 1000,
+ "mb": 1000**2,
+ "gb": 1000**3,
+ "kib": 1024,
+ "mib": 1024**2,
+ "gib": 1024**3,
+ }
+ if ul in table:
+ return float(table[ul])
+ return 1.0
+
+
+def parse_value(text_num: str, text_unit: Optional[str]) -> float:
+ return float(text_num) * unit_factor(text_unit or "")
+
+
+def parse_cond(cond: str) -> Tuple[str, float, str]:
+ # Regexp explained. Example of things it will parse:
+ # "< 310 KiB", ">=10MB", "== 42", "!=3 bytes", "<=0.5 MiB"
+
+ # The regexp explained in detail:
+ # ^: anchor the match to the start and end of the string (no extra chars allowed).
+ # \s*: optional whitespace (spaces, tabs, etc.).
+ # (<=|>=|==|!=|<|>): capturing group 1. One of the comparison operators: <=, >=, ==, !=, <, >.
+ # \s*: optional whitespace.
+ # ([0-9]+(?:\.[0-9]+)?): capturing group 2. A number:
+ # [0-9]+: one or more digits (the integer part).
+ # (?:\.[0-9]+)?: optional non-capturing group for a fractional part like .25.
+ # \s*: optional whitespace between number and unit
+    # ([A-Za-z]+)?: capturing group 3, optional. A unit made of letters only (e.g., B, KB, KiB, MB, MiB). Case-insensitive by class choice.
+ # \s*: optional trailing whitespace.
+ m = re.match(
+ r"^\s*(<=|>=|==|!=|<|>)\s*([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?\s*$", cond
+ )
+ if not m:
+ raise ValueError(f"Invalid condition: {cond}")
+ op, num, unit = m.groups()
+ return op, float(num), (unit or "")
+
+
+def compare(a: float, b: float, op: str) -> bool:
+ return {
+ "<": a < b,
+ "<=": a <= b,
+ ">": a > b,
+ ">=": a >= b,
+ "==": abs(a - b) < 1e-9,
+ "!=": abs(a - b) >= 1e-9,
+ }[op]
+
+
+def find_metric_value(line: str, label: str) -> Tuple[Optional[str], Optional[str]]:
+ # Same regexp as parse_cond() but without the first group of matching comparison operators
+ # First go, search for the pattern but escape and ignore cases
+ # The regexp:
+ # ([0-9]+(?:\.[0-9]+)?) — capturing group 1: a decimal number
+ # [0-9]+ — one or more digits (integer part)
+ # (?:\.[0-9]+)? — optional fractional part like .25 (non-capturing)
+ # \s* — optional whitespace between number and unit
+ # ([A-Za-z]+)? — capturing group 2 (optional): a unit made only of letters (e.g., B, KB, KiB, MB)
+ m = re.search(
+ re.escape(label) + r".*?([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?",
+ line,
+ flags=re.IGNORECASE,
+ )
+ if m:
+ return m.group(1), m.group(2)
+ # Second go, same regexp as above but not caring about label. If
+ # no number was tied to a label be happy just salvaging it from
+ # the line
+ m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", line)
+ if m:
+ return m.group(1), m.group(2)
+ return None, None
+
+
+def first_line_with_label(lines: List[str], label: str) -> Optional[str]:
+ label_lc = label.lower()
+ return next((ln for ln in lines if label_lc in ln.lower()), None)
+
+
+def check_requirement(label: str, cond: str, lines: List[str]) -> Optional[str]:
+ op, thr_num, thr_unit = parse_cond(cond)
+ matched = first_line_with_label(lines, label)
+ if matched is None:
+ return f"{label}: not found in log"
+
+ num_str, unit_str = find_metric_value(matched, label)
+ if num_str is None:
+ return f"{label}: value not found on line: {matched.strip()}"
+
+ left_bytes = parse_value(num_str, unit_str)
+ right_bytes = parse_value(str(thr_num), thr_unit or (unit_str or ""))
+ ok = compare(left_bytes, right_bytes, op)
+
+ human_left = f"{num_str} {unit_str or 'B'}"
+ human_right = f"{thr_num:g} {thr_unit or (unit_str or 'B')}"
+ print(
+ f"[check] {label}: {human_left} {op} {human_right} -> {'OK' if ok else 'FAIL'}"
+ )
+
+ if ok:
+ return None
+ return f"{label}: {human_left} not {op} {human_right}"
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--log", required=True, help="Path to log file")
+ parser.add_argument(
+ "--require",
+ action="append",
+ nargs=2,
+ metavar=("LABEL", "COND"),
+ default=[],
+ help="""Required label and condition consisting
+ of a number and unit. Example: \"Total DRAM
+ used\" \"<= 0.06 KiB\"""",
+ )
+ args = parser.parse_args()
+
+ with open(args.log, "r", encoding="utf-8", errors="ignore") as f:
+ lines = f.readlines()
+
+ failures: List[str] = []
+ for label, cond in args.require:
+ msg = check_requirement(label, cond, lines)
+ if msg:
+ failures.append(msg)
+
+ if failures:
+ print("Failures:")
+ for msg in failures:
+ print(" - " + msg)
+ return 1
+
+ print("All checks passed.")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
From f174974eb72df4c74cc863da05d930444e60fa6a Mon Sep 17 00:00:00 2001
From: per held
Date: Mon, 6 Oct 2025 20:10:37 +0200
Subject: [PATCH 140/266] Arm backend: Remove hello_world in core_software
(#14775)
---
...Remove-hello_world-from-applications.patch | 25 +++++++++++++++++++
1 file changed, 25 insertions(+)
create mode 100644 examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
diff --git a/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
new file mode 100644
index 00000000000..11590a8578f
--- /dev/null
+++ b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
@@ -0,0 +1,25 @@
+From f6a7d867212336b3e344c21240a2a03671bffd65 Mon Sep 17 00:00:00 2001
+From: Per Held
+Date: Wed, 17 Sep 2025 13:46:05 +0200
+Subject: Remove hello_world from applications
+
+---
+ applications/CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt
+index a017575..130f0f7 100644
+--- a/applications/CMakeLists.txt
++++ b/applications/CMakeLists.txt
+@@ -21,7 +21,7 @@ add_subdirectory(driver_unit_tests)
+
+ add_subdirectory(freertos)
+
+-add_subdirectory(hello_world)
++#add_subdirectory(hello_world)
+
+ add_subdirectory(threadx_demo)
+
+--
+2.43.0
+
From cf314751807e5b37a87d9f01877be4013b9c021a Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Mon, 6 Oct 2025 14:16:32 -0600
Subject: [PATCH 141/266] Revert "[Windows] Enable LLM preset in CI (#14805)"
(#14823)
This reverts commit 8c434ddb066feafa3773ac4332a7fed62e9c6c76.
Disabling for now as the Windows unittest jobs are failing post-merge.
They were clean on the PR, so probably just a conflict with a recent
change. I will investigate and re-merge.
---
.github/workflows/build-presets.yml | 2 +-
tools/cmake/preset/windows.cmake | 9 +++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml
index 46031ac7ea3..66ab19eef3c 100644
--- a/.github/workflows/build-presets.yml
+++ b/.github/workflows/build-presets.yml
@@ -109,7 +109,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- preset: [pybind, windows, llm]
+ preset: [pybind, windows]
with:
job-name: build
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/tools/cmake/preset/windows.cmake b/tools/cmake/preset/windows.cmake
index ef8bbbedbbf..b75a5af578e 100644
--- a/tools/cmake/preset/windows.cmake
+++ b/tools/cmake/preset/windows.cmake
@@ -4,9 +4,14 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/llm.cmake)
-
# keep sorted
set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
From a39866ca8c9a0a497f6682eb80e07ac99dbb96ba Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Mon, 6 Oct 2025 13:31:56 -0700
Subject: [PATCH 142/266] Fix op signature for avg_pool2d
Differential Revision: D83873533
Pull Request resolved: https://github.com/pytorch/executorch/pull/14787
---
backends/cadence/aot/ops_registrations.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index e3009163d62..f7d07018e59 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -329,7 +329,7 @@
"Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor out)"
)
lib.define(
- "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, "
+ "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], bool ceil_mode=False, "
"bool count_include_pad=True, int? divisor_override=None, Tensor? in_zero_point=None, bool channel_last=False) -> (Tensor out)"
)
lib.define(
@@ -525,7 +525,7 @@
"Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
- "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, "
+ "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], "
"bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, "
"Tensor? in_zero_point=None, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
)
From bc931e17135e38554e4752b2b3324b9754f29139 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 15:35:52 -0700
Subject: [PATCH 143/266] Update APP_PATH to point to mv3 directory (#14828)
---
scripts/test_ios.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh
index 8cb86f8f43c..599ae1683a4 100755
--- a/scripts/test_ios.sh
+++ b/scripts/test_ios.sh
@@ -15,7 +15,7 @@ set -e
OUTPUT="${1:-executorch}"
EXIT_STATUS=0
-APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo"
+APP_PATH="executorch-examples/mv3/apple/ExecuTorchDemo/ExecuTorchDemo"
MODEL_NAME="mv3"
SIMULATOR_NAME="executorch"
From 270873fa4fbab639820bb4375bd47ef2d2cd2fde Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada
Date: Mon, 6 Oct 2025 15:38:40 -0700
Subject: [PATCH 144/266] Restructure ET documentation with 'Platform First'
model (#14720)
### Summary
Restructure the ExecuTorch documentation into a 'Platform First' model:
content is reorganized by target platform (Android, iOS, desktop,
embedded) with new per-section landing pages (e.g. android-section,
ios-section, advanced-topics-section), and the legacy top-level pages
(api.md, backends.md, intro.md) are replaced by section index pages.
### Test plan
Docs-only change; verified locally that the restructured pages build
and that toctree navigation links resolve.
---
docs/source/advanced-topics-section.md | 112 +++++++
docs/source/android-arm-vgf.md | 1 +
docs/source/android-backends.md | 28 ++
docs/source/android-examples.md | 9 +
docs/source/android-mediatek.md | 1 +
docs/source/android-qualcomm.md | 1 +
docs/source/android-samsung-exynos.md | 1 +
docs/source/android-section.md | 23 ++
docs/source/android-vulkan.md | 1 +
docs/source/android-xnnpack.md | 1 +
docs/source/api-section.md | 26 ++
docs/source/api.md | 11 -
docs/source/backend-delegate-advanced.md | 33 ++
docs/source/backends-overview.md | 73 ++++-
docs/source/backends-samsung-exynos.md | 1 +
docs/source/backends-section.md | 1 +
docs/source/backends-xnnpack.md | 7 +-
docs/source/backends.md | 17 -
.../compiler-delegate-and-partitioner.md | 2 +-
docs/source/compiler-ir-advanced.md | 31 ++
docs/source/desktop-backends.md | 27 ++
docs/source/desktop-coreml.md | 1 +
docs/source/desktop-mps.md | 1 +
docs/source/desktop-openvino.md | 1 +
docs/source/desktop-section.md | 19 ++
docs/source/desktop-xnnpack.md | 1 +
docs/source/edge-platforms-section.md | 73 +++++
docs/source/embedded-arm-ethos-u.md | 1 +
docs/source/embedded-backends.md | 20 ++
docs/source/embedded-cadence.md | 1 +
docs/source/embedded-nxp.md | 1 +
docs/source/embedded-section.md | 39 +++
docs/source/file-formats-advanced.md | 17 +
docs/source/index.md | 307 +++++++++++-------
docs/source/intro-section.md | 27 ++
docs/source/intro.md | 10 -
docs/source/ios-backends.md | 19 ++
docs/source/ios-coreml.md | 1 +
docs/source/ios-examples.md | 4 +
docs/source/ios-mps.md | 1 +
docs/source/ios-section.md | 23 ++
docs/source/ios-xnnpack.md | 1 +
docs/source/kernel-library-advanced.md | 23 ++
docs/source/kernel-library-overview.md | 4 +-
...lama3-qualcomm-ai-engine-direct-backend.md | 5 +-
docs/source/llm/working-with-llms.md | 9 +-
docs/source/platforms-desktop.md | 23 ++
docs/source/platforms-embedded.md | 19 ++
docs/source/quantization-optimization.md | 20 ++
docs/source/quick-start-section.md | 38 +++
docs/source/runtime-integration-advanced.md | 20 ++
docs/source/success-stories.md | 56 ++++
docs/source/support-section.md | 17 +
docs/source/tools-section.md | 30 ++
docs/source/using-executorch-export.md | 2 +-
55 files changed, 1054 insertions(+), 187 deletions(-)
create mode 100644 docs/source/advanced-topics-section.md
create mode 100644 docs/source/android-arm-vgf.md
create mode 100644 docs/source/android-backends.md
create mode 100644 docs/source/android-examples.md
create mode 100644 docs/source/android-mediatek.md
create mode 100644 docs/source/android-qualcomm.md
create mode 100644 docs/source/android-samsung-exynos.md
create mode 100644 docs/source/android-section.md
create mode 100644 docs/source/android-vulkan.md
create mode 100644 docs/source/android-xnnpack.md
create mode 100644 docs/source/api-section.md
delete mode 100644 docs/source/api.md
create mode 100644 docs/source/backend-delegate-advanced.md
create mode 100644 docs/source/backends-samsung-exynos.md
create mode 100644 docs/source/backends-section.md
delete mode 100644 docs/source/backends.md
create mode 100644 docs/source/compiler-ir-advanced.md
create mode 100644 docs/source/desktop-backends.md
create mode 100644 docs/source/desktop-coreml.md
create mode 100644 docs/source/desktop-mps.md
create mode 100644 docs/source/desktop-openvino.md
create mode 100644 docs/source/desktop-section.md
create mode 100644 docs/source/desktop-xnnpack.md
create mode 100644 docs/source/edge-platforms-section.md
create mode 100644 docs/source/embedded-arm-ethos-u.md
create mode 100644 docs/source/embedded-backends.md
create mode 100644 docs/source/embedded-cadence.md
create mode 100644 docs/source/embedded-nxp.md
create mode 100644 docs/source/embedded-section.md
create mode 100644 docs/source/file-formats-advanced.md
create mode 100644 docs/source/intro-section.md
delete mode 100644 docs/source/intro.md
create mode 100644 docs/source/ios-backends.md
create mode 100644 docs/source/ios-coreml.md
create mode 100644 docs/source/ios-examples.md
create mode 100644 docs/source/ios-mps.md
create mode 100644 docs/source/ios-section.md
create mode 100644 docs/source/ios-xnnpack.md
create mode 100644 docs/source/kernel-library-advanced.md
create mode 100644 docs/source/platforms-desktop.md
create mode 100644 docs/source/platforms-embedded.md
create mode 100644 docs/source/quantization-optimization.md
create mode 100644 docs/source/quick-start-section.md
create mode 100644 docs/source/runtime-integration-advanced.md
create mode 100644 docs/source/success-stories.md
create mode 100644 docs/source/support-section.md
create mode 100644 docs/source/tools-section.md
diff --git a/docs/source/advanced-topics-section.md b/docs/source/advanced-topics-section.md
new file mode 100644
index 00000000000..e7b7f5490c6
--- /dev/null
+++ b/docs/source/advanced-topics-section.md
@@ -0,0 +1,112 @@
+(advanced-topics-section)=
+
+# Advanced
+
+Deep dive into ExecuTorch's advanced features for optimization, customization, and integration.
+
+This section covers advanced concepts for developers who need to customize ExecuTorch for specific use cases, optimize performance, or integrate with custom hardware backends.
+
+## Quantization & Optimization
+
+Techniques for model compression and performance optimization.
+
+**→ {doc}`quantization-optimization` — Quantization strategies and performance optimization**
+
+Key topics:
+
+- Quantization strategies and techniques
+- Performance profiling and optimization
+
+## Model Export
+
+Learn the core ExecuTorch workflow, exporting PyTorch models to the `.pte` format for edge deployment.
+
+**→ {doc}`using-executorch-export`** - Model Export & Lowering
+
+Key topics:
+
+- Export and Lowering Workflow
+- Hardware Backend Selection & Optimization
+- Dynamic Shapes & Advanced Model Features
+
+
+## Kernel Library
+
+Deep dive into ExecuTorch's kernel implementation and customization.
+
+**→ {doc}`kernel-library-advanced` — Kernel library deep dive and customization**
+
+Key topics:
+
+- Kernel library architecture
+- Custom kernel implementation
+- Selective build and optimization
+
+## Backend & Delegates
+
+**→ {doc}`backend-delegate-advanced` — Backend delegate integration**
+
+Key topics:
+
+- Learn how to integrate Backend Delegate into ExecuTorch and more
+- XNNPACK Delegate Internals
+- Debugging Delegation
+
+
+## Runtime & Integration
+
+Advanced runtime features and backend integration.
+
+**→ {doc}`runtime-integration-advanced` — Runtime customization and backend integration**
+
+Key topics:
+
+- Backend delegate implementation
+- Platform abstraction layer
+- Custom runtime integration
+
+## Compiler & IR
+
+Advanced compiler features and intermediate representation details.
+
+**→ {doc}`compiler-ir-advanced` — Compiler passes and IR specification**
+
+Key topics:
+
+- Custom compiler passes
+- Memory planning strategies
+- Backend dialect and EXIR
+- Ops set definition
+
+
+## File Formats
+
+ExecuTorch file format specifications and internals.
+
+**→ {doc}`file-formats-advanced` — PTE and PTD file format specifications**
+
+Key topics:
+
+- PTE file format internals
+- PTD file format specification
+- Custom file format handling
+
+## Next Steps
+
+After exploring advanced topics:
+
+- **{doc}`tools-sdk-section`** - Developer tools for debugging and profiling
+- **{doc}`api-section`** - Complete API reference documentation
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Advanced Topics
+
+quantization-optimization
+using-executorch-export
+kernel-library-advanced
+backend-delegate-advanced
+runtime-integration-advanced
+compiler-ir-advanced
+file-formats-advanced
diff --git a/docs/source/android-arm-vgf.md b/docs/source/android-arm-vgf.md
new file mode 100644
index 00000000000..cc39b53e176
--- /dev/null
+++ b/docs/source/android-arm-vgf.md
@@ -0,0 +1 @@
+```{include} backends-arm-vgf.md
diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md
new file mode 100644
index 00000000000..d506813990b
--- /dev/null
+++ b/docs/source/android-backends.md
@@ -0,0 +1,28 @@
+(android-backends)=
+# Backends
+
+Available hardware acceleration backends for Android deployment.
+
+## CPU Acceleration
+
+- {doc}`android-xnnpack` — XNNPACK CPU acceleration
+
+## GPU Acceleration
+
+- {doc}`android-vulkan` — Vulkan GPU acceleration
+
+## NPU/Accelerator Backends
+
+- {doc}`android-qualcomm` — Qualcomm AI Engine (NPU)
+- {doc}`android-mediatek` — MediaTek NPU acceleration
+- {doc}`android-arm-vgf` — ARM VGF Backend
+- {doc}`android-samsung-exynos` — Samsung Exynos NPU
+
+```{toctree}
+:hidden:
+android-xnnpack
+android-vulkan
+android-qualcomm
+android-mediatek
+android-arm-vgf
+android-samsung-exynos
diff --git a/docs/source/android-examples.md b/docs/source/android-examples.md
new file mode 100644
index 00000000000..65580870c57
--- /dev/null
+++ b/docs/source/android-examples.md
@@ -0,0 +1,9 @@
+# Examples & Demos
+
+- [Working with LLMs - Android Examples](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
+- [Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
+- {doc}`tutorial-arm-vgf` — Export a simple PyTorch model for the ExecuTorch VGF backend
+
+```{toctree}
+:hidden:
+tutorial-arm-vgf
diff --git a/docs/source/android-mediatek.md b/docs/source/android-mediatek.md
new file mode 100644
index 00000000000..7034fe439dd
--- /dev/null
+++ b/docs/source/android-mediatek.md
@@ -0,0 +1 @@
+```{include} backends-mediatek.md
diff --git a/docs/source/android-qualcomm.md b/docs/source/android-qualcomm.md
new file mode 100644
index 00000000000..f484d771a8b
--- /dev/null
+++ b/docs/source/android-qualcomm.md
@@ -0,0 +1 @@
+```{include} backends-qualcomm.md
diff --git a/docs/source/android-samsung-exynos.md b/docs/source/android-samsung-exynos.md
new file mode 100644
index 00000000000..4c5a470edca
--- /dev/null
+++ b/docs/source/android-samsung-exynos.md
@@ -0,0 +1 @@
+```{include} backends-samsung-exynos.md
diff --git a/docs/source/android-section.md b/docs/source/android-section.md
new file mode 100644
index 00000000000..a5774352bc1
--- /dev/null
+++ b/docs/source/android-section.md
@@ -0,0 +1,23 @@
+(android-section)=
+
+# Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-android` — Complete Android integration guide
+
+## Backends
+
+- {doc}`android-backends` — Available Android backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`android-examples` — Explore Android Examples & Demos
+
+```{toctree}
+:hidden:
+using-executorch-android
+android-backends
+android-examples
diff --git a/docs/source/android-vulkan.md b/docs/source/android-vulkan.md
new file mode 100644
index 00000000000..6399ac4ec7c
--- /dev/null
+++ b/docs/source/android-vulkan.md
@@ -0,0 +1 @@
+```{include} backends-vulkan.md
diff --git a/docs/source/android-xnnpack.md b/docs/source/android-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/android-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/api-section.md b/docs/source/api-section.md
new file mode 100644
index 00000000000..f5725a063d4
--- /dev/null
+++ b/docs/source/api-section.md
@@ -0,0 +1,26 @@
+(api-section)=
+# API
+
+In this section, find complete API documentation for ExecuTorch's export, runtime, and extension interfaces. Includes comprehensive references for Python, C++, and Java APIs across all supported platforms.
+
+- {doc}`export-to-executorch-api-reference` — Export to ExecuTorch API Reference
+- {doc}`executorch-runtime-api-reference` — ExecuTorch Runtime API Reference
+- {doc}`runtime-python-api-reference` — Runtime Python API Reference
+- {doc}`api-life-cycle` — API Life Cycle
+- [Android doc →](https://pytorch.org/executorch/main/javadoc/) — Android API Documentation
+- {doc}`extension-module` — Extension Module
+- {doc}`extension-tensor` — Extension Tensor
+- {doc}`running-a-model-cpp-tutorial` — Detailed C++ Runtime APIs Tutorial
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: API Reference
+
+export-to-executorch-api-reference
+executorch-runtime-api-reference
+runtime-python-api-reference
+api-life-cycle
+extension-module
+extension-tensor
+running-a-model-cpp-tutorial
diff --git a/docs/source/api.md b/docs/source/api.md
deleted file mode 100644
index 4f6160d258a..00000000000
--- a/docs/source/api.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# API
-
-```{toctree}
-:maxdepth: 1
-
-export-to-executorch-api-reference
-executorch-runtime-api-reference
-runtime-python-api-reference
-api-life-cycle
-Javadoc
-```
diff --git a/docs/source/backend-delegate-advanced.md b/docs/source/backend-delegate-advanced.md
new file mode 100644
index 00000000000..752bd1cdc02
--- /dev/null
+++ b/docs/source/backend-delegate-advanced.md
@@ -0,0 +1,33 @@
+(backend-delegate-advanced)=
+
+# Backend & Delegates
+
+## Integration
+
+- {doc}`backend-delegates-integration` — Learn how to integrate a backend delegate into ExecuTorch
+
+## XNNPACK Reference
+
+- {doc}`backend-delegates-xnnpack-reference` — Deep dive into XNNPACK delegate internals and implementation details
+
+## Dependency Management
+
+- {doc}`backend-delegates-dependencies` — Manage third-party dependencies for backend delegates
+
+## Overview
+
+- {doc}`compiler-delegate-and-partitioner` — Understanding backends, delegates, and the partitioner system
+
+## Debugging
+
+- {doc}`debug-backend-delegate` — Tools and techniques for debugging delegation issues
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+backend-delegates-integration
+backend-delegates-xnnpack-reference
+backend-delegates-dependencies
+compiler-delegate-and-partitioner
+debug-backend-delegate
diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md
index c83ace26853..b15b466d6a6 100644
--- a/docs/source/backends-overview.md
+++ b/docs/source/backends-overview.md
@@ -1,21 +1,64 @@
-# Backend Overview
+# Backends
-ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each.
+## Backend Overview
-The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details.
+ExecuTorch backends provide hardware acceleration for specific hardware targets, enabling models to run efficiently on devices ranging from mobile phones to embedded systems and DSPs. During the export and lowering process, ExecuTorch optimizes your model for the chosen backend, resulting in a `.pte` file specialized for that hardware. To support multiple platforms (e.g., Core ML on iOS, Arm CPU on Android), you typically generate a dedicated `.pte` file for each backend.
-As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example.
+The choice of backend is informed by the hardware your model will run on. Each backend has its own hardware requirements and level of model/operator support. See the documentation for each backend for details.
-### Available Backends
+As part of `.pte` file creation, ExecuTorch identifies model partitions supported by the backend. These are processed ahead of time for efficient execution. Operators not supported by the delegate are executed using the portable CPU fallback (e.g., XNNPACK), allowing for partial acceleration. You can also specify multiple partitioners in order of priority, so unsupported GPU ops can fall back to CPU, for example.
-Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information.
+---
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
-- [Core ML (iOS)](backends-coreml.md)
-- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
-- [Vulkan (Android GPU)](backends-vulkan.md)
-- [Qualcomm NPU](backends-qualcomm.md)
-- [MediaTek NPU](backends-mediatek.md)
-- [ARM Ethos-U NPU](backends-arm-ethos-u.md)
-- [ARM VGF](backends-arm-vgf.md)
-- [Cadence DSP](backends-cadence.md)
+## Why Backends Matter
+
+Backends are the bridge between your exported model and the hardware it runs on. Choosing the right backend ensures your model takes full advantage of device-specific acceleration, balancing performance, compatibility, and resource usage.
+
+---
+
+## Choosing a Backend
+
+| Backend | Platform(s) | Hardware Type | Typical Use Case |
+|------------------------------------------|---------------------|---------------|---------------------------------|
+| [XNNPACK](backends-xnnpack) | All | CPU | General-purpose, fallback |
+| [Core ML](backends-coreml) | iOS, macOS | NPU/GPU | Apple devices, high performance |
+| [Metal Performance Shaders](backends-mps)| iOS, macOS | GPU | Apple GPU acceleration |
+| [Vulkan ](backends-vulkan) | Android | GPU | Android GPU acceleration |
+| [Qualcomm](backends-qualcomm) | Android | NPU | Qualcomm SoCs |
+| [MediaTek](backends-mediatek) | Android | NPU | MediaTek SoCs |
+| [ARM EthosU](backends-arm-ethos-u) | Embedded | NPU | ARM MCUs |
+| [ARM VGF](backends-arm-vgf) | Android | NPU | ARM platforms |
+| [OpenVINO](build-run-openvino) | Embedded | CPU/GPU/NPU | Intel SoCs |
+| [NXP](backends-nxp) | Embedded | NPU | NXP SoCs |
+| [Cadence](backends-cadence) | Embedded | DSP | DSP-optimized workloads |
+| [Samsung Exynos](backends-samsung-exynos)| Android             | NPU           | Samsung SoCs                    |
+
+**Tip:** For best performance, export a `.pte` file for each backend you plan to support.
+
+---
+
+## Best Practices
+
+- **Test on all target devices:** Operator support may vary by backend.
+- **Use fallback wisely:** If a backend doesn't support an operator, ExecuTorch will run it on CPU.
+- **Consult backend docs:** Each backend has unique setup and tuning options.
+
+---
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Backend Overview
+
+backends-xnnpack
+backends-coreml
+backends-mps
+backends-vulkan
+backends-qualcomm
+backends-mediatek
+backends-arm-ethos-u
+backends-arm-vgf
+build-run-openvino
+backends-nxp
+backends-cadence
+backends-samsung-exynos
diff --git a/docs/source/backends-samsung-exynos.md b/docs/source/backends-samsung-exynos.md
new file mode 100644
index 00000000000..0d77936bf7f
--- /dev/null
+++ b/docs/source/backends-samsung-exynos.md
@@ -0,0 +1 @@
+# Samsung Exynos Backend (TBD)
diff --git a/docs/source/backends-section.md b/docs/source/backends-section.md
new file mode 100644
index 00000000000..29a235a9416
--- /dev/null
+++ b/docs/source/backends-section.md
@@ -0,0 +1 @@
+```{include} backends-overview.md
diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md
index d1a120e69fa..75ec17809a4 100644
--- a/docs/source/backends-xnnpack.md
+++ b/docs/source/backends-xnnpack.md
@@ -67,10 +67,11 @@ The XNNPACK delegate can also be used as a backend to execute symmetrically quan
### Supported Quantization Schemes
The XNNPACK delegate supports the following quantization schemes:
+
- 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow).
- - Supports both static and dynamic activations.
- - Supports per-channel and per-tensor schemes.
- - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
+ - Supports both static and dynamic activations.
+ - Supports per-channel and per-tensor schemes.
+ - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
Weight-only quantization is not currently supported on XNNPACK.
diff --git a/docs/source/backends.md b/docs/source/backends.md
deleted file mode 100644
index 53db638f36d..00000000000
--- a/docs/source/backends.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Backends
-
-```{toctree}
-:maxdepth: 1
-
-backends-overview
-backends-xnnpack
-backends-coreml
-backends-mps
-backends-vulkan
-backends-arm-ethos-u
-backends-qualcomm
-backends-mediatek
-backends-cadence
-OpenVINO Backend
-backends-nxp
-```
diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md
index c633bb1fd12..437361517cc 100644
--- a/docs/source/compiler-delegate-and-partitioner.md
+++ b/docs/source/compiler-delegate-and-partitioner.md
@@ -1,4 +1,4 @@
-# Backends and Delegates
+# Understanding Backends and Delegates
Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch
diff --git a/docs/source/compiler-ir-advanced.md b/docs/source/compiler-ir-advanced.md
new file mode 100644
index 00000000000..b6d24026d5a
--- /dev/null
+++ b/docs/source/compiler-ir-advanced.md
@@ -0,0 +1,31 @@
+(compiler-ir-advanced)=
+# Compiler & IR
+
+Advanced compiler features and intermediate representation specifications.
+
+## Compiler Passes
+
+- {doc}`compiler-custom-compiler-passes` — Custom compiler passes and optimization
+
+## Memory Management
+
+- {doc}`compiler-memory-planning` — Advanced memory planning strategies
+
+## Intermediate Representation
+
+- {doc}`ir-exir` — EXIR (Export Intermediate Representation) specification
+- {doc}`ir-ops-set-definition` — Ops set definition and operator standardization
+
+## Backend dialect
+
+- {doc}`compiler-backend-dialect` — Backend dialect and compiler integration
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+compiler-custom-compiler-passes
+compiler-memory-planning
+ir-exir
+ir-ops-set-definition
+compiler-backend-dialect
diff --git a/docs/source/desktop-backends.md b/docs/source/desktop-backends.md
new file mode 100644
index 00000000000..e4220edb47f
--- /dev/null
+++ b/docs/source/desktop-backends.md
@@ -0,0 +1,27 @@
+(desktop-backends)=
+# Backends
+
+Available hardware acceleration backends for desktop platforms.
+
+## Linux Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+## macOS Backends
+
+- {doc}`desktop-coreml` — CoreML (recommended for Apple Silicon)
+- {doc}`desktop-mps` — Metal Performance Shaders (Apple Silicon GPU)
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+
+## Windows Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+```{toctree}
+:hidden:
+desktop-xnnpack
+desktop-openvino
+desktop-coreml
+desktop-mps
diff --git a/docs/source/desktop-coreml.md b/docs/source/desktop-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/desktop-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/desktop-mps.md b/docs/source/desktop-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/desktop-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/desktop-openvino.md b/docs/source/desktop-openvino.md
new file mode 100644
index 00000000000..a0fd5774c73
--- /dev/null
+++ b/docs/source/desktop-openvino.md
@@ -0,0 +1 @@
+```{include} build-run-openvino.md
diff --git a/docs/source/desktop-section.md b/docs/source/desktop-section.md
new file mode 100644
index 00000000000..7afccbe1d4f
--- /dev/null
+++ b/docs/source/desktop-section.md
@@ -0,0 +1,19 @@
+(desktop-section)=
+# Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends for each platform.
+
+## Platform Overview & Runtime
+
+- {doc}`using-executorch-cpp` — C++ runtime integration guide
+- {doc}`using-executorch-building-from-source` — Building ExecuTorch from source
+
+## Backends
+
+- {doc}`desktop-backends` — Available desktop backends and platform-specific optimization
+
+```{toctree}
+:hidden:
+using-executorch-cpp
+using-executorch-building-from-source
+desktop-backends
diff --git a/docs/source/desktop-xnnpack.md b/docs/source/desktop-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/desktop-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/edge-platforms-section.md b/docs/source/edge-platforms-section.md
new file mode 100644
index 00000000000..8761325451d
--- /dev/null
+++ b/docs/source/edge-platforms-section.md
@@ -0,0 +1,73 @@
+(edge-platforms-section)=
+# Edge
+
+Deploy ExecuTorch on mobile, desktop, and embedded platforms with optimized backends for each.
+
+ExecuTorch supports deployment across a wide variety of edge computing platforms, from high-end mobile devices to constrained embedded systems and microcontrollers.
+
+## Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+**→ {doc}`android-section` — Complete Android deployment guide**
+
+Key features:
+- Hardware acceleration support (CPU, GPU, NPU)
+- Multiple backend options (XNNPACK, Vulkan, Qualcomm, MediaTek, ARM, Samsung)
+- Comprehensive examples and demos
+
+## iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+**→ {doc}`ios-section` — Complete iOS deployment guide**
+
+Key features:
+- Apple hardware optimization (CoreML, MPS, XNNPACK)
+- Swift and Objective-C integration
+- LLM and computer vision examples
+
+## Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends.
+
+**→ {doc}`desktop-section` — Complete desktop deployment guide**
+
+Key features:
+- Cross-platform C++ runtime
+- Platform-specific optimization (OpenVINO, CoreML, MPS)
+- CPU and GPU acceleration options
+
+## Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+**→ {doc}`embedded-section` — Complete embedded deployment guide**
+
+Key features:
+
+- Resource-constrained deployment
+- DSP and NPU acceleration (Cadence, ARM Ethos-U, NXP)
+- Custom backend development support
+- LLM and computer vision examples
+
+## Troubleshooting & Support
+
+- **{doc}`using-executorch-troubleshooting`** - Common issues and solutions across all platforms
+
+## Next Steps
+
+After choosing your platform:
+- **{doc}`backends-section`** - Deep dive into backend selection and optimization
+- **{doc}`llms-section`** - Working with Large Language Models on edge devices
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Edge Platforms
+
+android-section
+ios-section
+desktop-section
+embedded-section
+using-executorch-troubleshooting
diff --git a/docs/source/embedded-arm-ethos-u.md b/docs/source/embedded-arm-ethos-u.md
new file mode 100644
index 00000000000..cdc544a6553
--- /dev/null
+++ b/docs/source/embedded-arm-ethos-u.md
@@ -0,0 +1 @@
+```{include} backends-arm-ethos-u.md
diff --git a/docs/source/embedded-backends.md b/docs/source/embedded-backends.md
new file mode 100644
index 00000000000..4ed7962ef42
--- /dev/null
+++ b/docs/source/embedded-backends.md
@@ -0,0 +1,20 @@
+(embedded-backends)=
+# Backends
+
+Available hardware acceleration backends for embedded systems.
+
+## DSP Acceleration
+
+- {doc}`embedded-cadence` — Cadence Xtensa DSP processors
+
+## NPU Acceleration
+
+- {doc}`embedded-arm-ethos-u` — ARM Ethos-U NPU acceleration
+- {doc}`embedded-nxp` — NXP eIQ Neutron Backend
+
+
+```{toctree}
+:hidden:
+embedded-cadence
+embedded-arm-ethos-u
+embedded-nxp
diff --git a/docs/source/embedded-cadence.md b/docs/source/embedded-cadence.md
new file mode 100644
index 00000000000..d2f7ea78259
--- /dev/null
+++ b/docs/source/embedded-cadence.md
@@ -0,0 +1 @@
+```{include} backends-cadence.md
diff --git a/docs/source/embedded-nxp.md b/docs/source/embedded-nxp.md
new file mode 100644
index 00000000000..35d8f0ab75d
--- /dev/null
+++ b/docs/source/embedded-nxp.md
@@ -0,0 +1 @@
+```{include} backends-nxp.md
diff --git a/docs/source/embedded-section.md b/docs/source/embedded-section.md
new file mode 100644
index 00000000000..834001afbc3
--- /dev/null
+++ b/docs/source/embedded-section.md
@@ -0,0 +1,39 @@
+(embedded-section)=
+
+# Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+## API Reference & Development
+
+Start here for C++ development with ExecuTorch runtime APIs and essential tutorials.
+
+- {doc}`executorch-runtime-api-reference` — **Start here**: Complete runtime API reference for embedded development
+- {doc}`running-a-model-cpp-tutorial` — Step-by-step C++ API tutorial with practical examples
+- {doc}`extension-module` — Custom module extensions for specialized functionality
+- {doc}`extension-tensor` — Tensor operations and memory management extensions
+
+## Build & Integration Guide
+
+- {doc}`using-executorch-cpp` — Complete setup guide for C++ runtime integration
+- {doc}`using-executorch-building-from-source` — Building from Source
+
+## Choose Backend for acceleration
+
+- {doc}`embedded-backends` — Available embedded backends and acceleration options
+
+## Tutorials
+
+- {doc}`tutorial-arm-ethos-u` — Export a simple PyTorch model for the ExecuTorch Ethos-U backend
+
+
+```{toctree}
+:hidden:
+executorch-runtime-api-reference
+running-a-model-cpp-tutorial
+extension-module
+extension-tensor
+using-executorch-cpp
+using-executorch-building-from-source
+embedded-backends
+tutorial-arm-ethos-u
diff --git a/docs/source/file-formats-advanced.md b/docs/source/file-formats-advanced.md
new file mode 100644
index 00000000000..c16ebccfd65
--- /dev/null
+++ b/docs/source/file-formats-advanced.md
@@ -0,0 +1,17 @@
+(file-formats-advanced)=
+
+# File Formats
+
+ExecuTorch file format specifications and internal structure.
+
+## Program File Formats
+
+- {doc}`pte-file-format` — PTE (PyTorch ExecuTorch) file format specification
+- {doc}`ptd-file-format` — PTD file format specification
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+pte-file-format
+ptd-file-format
diff --git a/docs/source/index.md b/docs/source/index.md
index fd0957d8fd4..b65139319a7 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,134 +1,195 @@
(home)=
# Welcome to the ExecuTorch Documentation
-**ExecuTorch** is PyTorch's solution to training and inference on the
-Edge.
+**ExecuTorch** is PyTorch's solution for efficient AI inference on edge devices — from mobile phones to embedded systems.
## Key Value Propositions
-- **Portability:** Compatibility with a wide variety of computing
- platforms, from high-end mobile phones to highly constrained
- embedded systems and microcontrollers.
-- **Productivity:** Enabling developers to use the same toolchains and
- Developer Tools from PyTorch model authoring and conversion, to
- debugging and deployment to a wide variety of platforms.
-- **Performance:** Providing end users with a seamless and
- high-performance experience due to a lightweight runtime and
- utilizing full hardware capabilities such as CPUs, NPUs, and DSPs.
-
-ExecuTorch provides support for:
-
-* **Strong Model Support** LLMs (Large Language Models),
- CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech)
-* **All Major Platforms** Android, Mac, Linux, Windows
-* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek, NXP, OpenVino, Qualcomm, Vulkan, XNNPACK
-
-### Documentation Navigation
-#### Introduction
-- [Overview](intro-overview)
-- [How it Works](intro-how-it-works)
-- [Getting Started with Architecture](getting-started-architecture)
-- [Concepts](concepts)
-#### Usage
-- [Getting Started](getting-started)
-- [Using Executorch Export](using-executorch-export)
-- [Using Executorch on Android](using-executorch-android)
-- [Using Executorch on iOS](using-executorch-ios)
-- [Using Executorch with C++](using-executorch-cpp)
-- [Runtime Integration](using-executorch-runtime-integration)
-- [Troubleshooting](using-executorch-troubleshooting)
-- [Building from Source](using-executorch-building-from-source)
-- [Quantization](quantization-overview)
-- [FAQs](using-executorch-faqs)
-#### Examples
-- [Android Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
-- [iOS Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
-- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md)
-#### Backends
-- [Overview](backends-overview)
-- [XNNPACK](backends-xnnpack)
-- [Core ML](backends-coreml)
-- [MPS](backends-mps)
-- [Vulkan](backends-vulkan)
-- [ARM Ethos-U](backends-arm-ethos-u)
-- [ARM VGF](backends-arm-vgf)
-- [Qualcomm](backends-qualcomm)
-- [MediaTek](backends-mediatek)
-- [Cadence](backends-cadence)
-- [OpenVINO](build-run-openvino)
-- [NXP](backend-nxp)
-#### Developer Tools
-- [Overview](devtools-overview)
-- [Bundled IO](bundled-io)
-- [ETRecord](etrecord)
-- [ETDump](etdump)
-- [Runtime Profiling](runtime-profiling)
-- [Model Debugging](model-debugging)
-- [Model Inspector](model-inspector)
-- [Memory Planning Inspection](memory-planning-inspection)
-- [Delegate Debugging](delegate-debugging)
-- [Tutorial](devtools-tutorial)
-#### Runtime
-- [Overview](runtime-overview)
-- [Extension Module](extension-module)
-- [Extension Tensor](extension-tensor)
-- [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial)
-- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking)
-- [Platform Abstraction Layer](runtime-platform-abstraction-layer)
-#### Portable C++ Programming
-- [PTE File Format](pte-file-format)
-- [PTD File Format](ptd-file-format)
-#### API Reference
-- [Export to Executorch API Reference](export-to-executorch-api-reference)
-- [Executorch Runtime API Reference](executorch-runtime-api-reference)
-- [Runtime Python API Reference](runtime-python-api-reference)
-- [API Life Cycle](api-life-cycle)
-- [Javadoc](https://pytorch.org/executorch/main/javadoc/)
-#### Kernel Library
-- [Overview](kernel-library-overview)
-- [Custom ATen Kernel](kernel-library-custom-aten-kernel)
-- [Selective Build](kernel-library-selective-build)
-#### Working with LLMs
-- [Getting Started](llm/getting-started.md)
-- [Exporting LLMs](llm/export-llm.md)
-- [Exporting custom LLMs](llm/export-custom-llm.md)
-- [Running with C++](llm/run-with-c-plus-plus.md)
-- [Running on Android (XNNPack)](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
-- [Running on Android (QNN)](llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md)
-- [Running on iOS](llm/run-on-ios.md)
-#### Backend Development
-- [Delegates Integration](backend-delegates-integration)
-- [XNNPACK Reference](backend-delegates-xnnpack-reference)
-- [Dependencies](backend-delegates-dependencies)
-- [Compiler Delegate and Partitioner](compiler-delegate-and-partitioner)
-- [Debug Backend Delegate](debug-backend-delegate)
-#### IR Specification
-- [EXIR](ir-exir)
-- [Ops Set Definition](ir-ops-set-definition)
-#### Compiler Entry Points
-- [Backend Dialect](compiler-backend-dialect)
-- [Custom Compiler Passes](compiler-custom-compiler-passes)
-- [Memory Planning](compiler-memory-planning)
-#### Contributing
-- [Contributing](contributing)
+- **Portability:** Run on diverse platforms, from high-end mobile to constrained microcontrollers
+- **Performance:** Lightweight runtime with full hardware acceleration (CPU, GPU, NPU, DSP)
+- **Productivity:** Use familiar PyTorch tools from authoring to deployment
+
+---
+
+## 🎯 Wins & Success Stories
+
+::::{grid} 1
+:class-container: success-showcase
+:::{grid-item-card}
+:class-header: bg-primary text-white
+:class-body: text-center
+[View All Success Stories →](success-stories)
+:::
+::::
+
+---
+
+## Quick Navigation
+
+::::{grid} 2
+
+:::{grid-item-card} **Get Started**
+:link: quick-start-section
+:link-type: doc
+
+New to ExecuTorch? Start here for installation and your first model deployment.
+:::
+
+:::{grid-item-card} **Deploy on Edge Platforms**
+:link: edge-platforms-section
+:link-type: doc
+
+Deploy on Android, iOS, desktop, and embedded platforms with optimized backends.
+:::
+
+:::{grid-item-card} **Work with LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+Export, optimize, and deploy Large Language Models on edge devices.
+:::
+
+:::{grid-item-card} 🔧 **Developer Tools**
+:link: tools-section
+:link-type: doc
+
+Profile, debug, and inspect your models with comprehensive tooling.
+:::
+
+::::
+
+---
+
+## Explore Documentation
+
+::::{grid} 1
+:::{grid-item-card} **Intro**
+:link: intro-section
+:link-type: doc
+
+**Overview, architecture, and core concepts** — Understand how ExecuTorch works and its benefits
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Quick Start**
+:link: quick-start-section
+:link-type: doc
+
+**Get started with ExecuTorch** — Install, export your first model, and run inference
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Edge**
+:link: edge-platforms-section
+:link-type: doc
+
+**Android, iOS, Desktop, Embedded** — Platform-specific deployment guides and examples
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Backends**
+:link: backends-section
+:link-type: doc
+
+**CPU, GPU, NPU/Accelerator backends** — Hardware acceleration and backend selection
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+**LLM export, optimization, and deployment** — Complete LLM workflow for edge devices
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Advanced**
+:link: advanced-topics-section
+:link-type: doc
+
+**Quantization, memory planning, custom passes** — Deep customization and optimization
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Tools**
+:link: tools-section
+:link-type: doc
+
+**Developer tools, profiling, debugging** — Comprehensive development and debugging suite
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **API**
+:link: api-section
+:link-type: doc
+
+**API Reference Usages & Examples** — Detailed Python, C++, and Java API references
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **💬 Support**
+:link: support-section
+:link-type: doc
+
+**FAQ, troubleshooting, contributing** — Get help and contribute to the project
+:::
+::::
+
+---
+
+## What's Supported
+
+::::{grid} 3
+
+:::{grid-item}
+**Model Types**
+
+- Large Language Models (LLMs)
+- Computer Vision (CV)
+- Speech Recognition (ASR)
+- Text-to-Speech (TTS)
+- More ...
+:::
+
+:::{grid-item}
+**Platforms**
+
+- Android & iOS
+- Linux, macOS, Windows
+- Embedded & MCUs
+- Go **→ {doc}`edge-platforms-section`**
+:::
+
+:::{grid-item}
+**Rich Acceleration**
+
+- CPU
+- GPU
+- NPU
+- DSP
+- Go **→ {doc}`backends-section`**
+:::
+
+::::
```{toctree}
-:glob:
-:maxdepth: 1
:hidden:
+:maxdepth: 1
-intro
-usage
-examples
-backends
-developer-tools
-runtime
-api
-quantization
-kernel-library
+intro-section
+quick-start-section
+edge-platforms-section
+backends-section
llm/working-with-llms
-backend-development
-ir-specification
-compiler-entry-points
-contributing
-```
+advanced-topics-section
+tools-section
+api-section
+support-section
diff --git a/docs/source/intro-section.md b/docs/source/intro-section.md
new file mode 100644
index 00000000000..2f6f3c57c88
--- /dev/null
+++ b/docs/source/intro-section.md
@@ -0,0 +1,27 @@
+(intro-section)=
+
+# Intro
+
+Overview, architecture, and core concepts of ExecuTorch.
+
+ExecuTorch is PyTorch's solution for efficient AI inference on edge devices, providing portability, productivity, and performance across a wide range of edge computing platforms.
+
+## Getting Started with ExecuTorch
+
+New to ExecuTorch? Start with these foundational topics:
+
+- **{doc}`intro-overview`** - High-level overview of ExecuTorch capabilities
+- **{doc}`intro-how-it-works`** - Technical overview of the ExecuTorch workflow
+- **{doc}`getting-started-architecture`** - System architecture and components
+- **{doc}`concepts`** - Core concepts and terminology
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Introduction Topics
+
+intro-overview
+intro-how-it-works
+getting-started-architecture
+concepts
+```
diff --git a/docs/source/intro.md b/docs/source/intro.md
deleted file mode 100644
index f6609cc3ba7..00000000000
--- a/docs/source/intro.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Intro
-
-```{toctree}
-:maxdepth: 1
-
-intro-overview
-intro-how-it-works
-getting-started-architecture
-concepts
-```
diff --git a/docs/source/ios-backends.md b/docs/source/ios-backends.md
new file mode 100644
index 00000000000..cb186f53319
--- /dev/null
+++ b/docs/source/ios-backends.md
@@ -0,0 +1,19 @@
+(ios-backends)=
+# Backends
+
+Available hardware acceleration backends for iOS deployment.
+
+## Apple Hardware Acceleration (Recommended)
+
+- {doc}`ios-coreml` — CoreML (NPU/GPU, recommended for iOS)
+- {doc}`ios-mps` — Metal Performance Shaders (GPU)
+
+## CPU Acceleration
+
+- {doc}`ios-xnnpack` — XNNPACK (CPU acceleration)
+
+```{toctree}
+:hidden:
+ios-coreml
+ios-mps
+ios-xnnpack
diff --git a/docs/source/ios-coreml.md b/docs/source/ios-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/ios-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/ios-examples.md b/docs/source/ios-examples.md
new file mode 100644
index 00000000000..86acf3273a6
--- /dev/null
+++ b/docs/source/ios-examples.md
@@ -0,0 +1,4 @@
+# Examples & Demos
+
+- [iOS LLM Examples Repository](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple)
+- [MobileViT Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
diff --git a/docs/source/ios-mps.md b/docs/source/ios-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/ios-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/ios-section.md b/docs/source/ios-section.md
new file mode 100644
index 00000000000..33c9a61ce1d
--- /dev/null
+++ b/docs/source/ios-section.md
@@ -0,0 +1,23 @@
+(ios-section)=
+# iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-ios` — Complete iOS integration guide
+
+## Backends
+
+- {doc}`ios-backends` — Available iOS backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`ios-examples` — Explore iOS Examples & Demos
+
+
+```{toctree}
+:hidden:
+using-executorch-ios
+ios-backends
+ios-examples
diff --git a/docs/source/ios-xnnpack.md b/docs/source/ios-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/ios-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/kernel-library-advanced.md b/docs/source/kernel-library-advanced.md
new file mode 100644
index 00000000000..5f0215b87c1
--- /dev/null
+++ b/docs/source/kernel-library-advanced.md
@@ -0,0 +1,23 @@
+(kernel-library-advanced)=
+
+# Kernel Library Deep Dive
+
+Advanced kernel implementation and customization for ExecuTorch.
+
+## Kernel Library Overview
+
+- {doc}`kernel-library-overview` — Architecture and design of the kernel library
+
+- {doc}`kernel-library-custom-aten-kernel` — Kernel registration and customization
+
+## Build Optimization
+
+- {doc}`kernel-library-selective-build` — Selective build for reduced binary footprint
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+kernel-library-overview
+kernel-library-custom-aten-kernel
+kernel-library-selective-build
diff --git a/docs/source/kernel-library-overview.md b/docs/source/kernel-library-overview.md
index cfd46524097..a826b334ba4 100644
--- a/docs/source/kernel-library-overview.md
+++ b/docs/source/kernel-library-overview.md
@@ -1,7 +1,7 @@
-This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
-
# Overview of ExecuTorch’s Kernel Libraries
+This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
+
An ExecuTorch program encodes instructions that describe the computation that should be performed by the program. Many of these instructions will correspond to calling a specific ATen operator, for example `aten.convolution`. However, one of the core design principles of ExecuTorch is that the signature of an operator should be separate from the implementation of the operator. This means that the ExecuTorch runtime does not ship with any standard implementation for ATen operators; users must make sure to link against kernel libraries that contain implementations of the operators required by their ExecuTorch program, and configure [operator registration](kernel-library-custom-aten-kernel.md) to map an operator signature to the desired implementation. This makes it easy to adjust the implementation of operators such as `aten.convolution` that will be called when executing an ExecuTorch program; it allows users to select the exact operator implementations that will meet the unique performance, memory usage, battery usage, etc. constraints of their use-case.
**In essence, a kernel library is simply a collection of ATen operator implementations that follow a common theme or design principle**. Note that due to ExecuTorch’s selective build process (discussed in the following section), operator implementations are linked individually. This means that users can easily mix different kernel libraries in their build without sacrificing build size.
diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
index 4587589a51b..642dc04da58 100644
--- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
+++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
@@ -1,4 +1,4 @@
-# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend
+# Run Llama 3 8B on Android (with Qualcomm AI Engine Direct Backend)
This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device.
@@ -56,7 +56,7 @@ backend:
qnn:
enabled: True
num_sharding: 8
-
+
# export_llm
python -m extension.llm.export.export_llm \
@@ -136,6 +136,7 @@ You should see the message:
```
## What is coming?
+
- Performance improvements
- Reduce the memory pressure during inference to support 12GB Qualcomm devices
- Support more LLMs (Qwen, Phi-4-mini, etc.)
diff --git a/docs/source/llm/working-with-llms.md b/docs/source/llm/working-with-llms.md
index 17b2e46c0a5..4c238f7ae5c 100644
--- a/docs/source/llm/working-with-llms.md
+++ b/docs/source/llm/working-with-llms.md
@@ -1,13 +1,18 @@
-# Working with LLMs
+(working-with-llms)=
+
+# LLMs
+
+Learn how to export LLM models and deploy them across different platforms and runtime environments. This section covers the complete workflow from model export to running inference on mobile devices and edge hardware.
+
```{toctree}
:maxdepth: 1
+:caption: Working with LLMs
getting-started
export-llm
export-custom-llm
run-with-c-plus-plus
-llama-demo-android
build-run-llama3-qualcomm-ai-engine-direct-backend
run-on-ios
```
diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md
new file mode 100644
index 00000000000..acbdb06a6b6
--- /dev/null
+++ b/docs/source/platforms-desktop.md
@@ -0,0 +1,23 @@
+# Desktop & Laptop
+
+ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windows.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Platform
+
+### Linux
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
+- [ARM Ethos-U (ARM64)](backends-arm-ethos-u)
+
+### macOS
+- [CoreML (recommended)](backends-coreml)
+- [MPS (Apple Silicon)](backends-mps)
+- [XNNPACK (CPU)](backends-xnnpack)
+
+### Windows
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md
new file mode 100644
index 00000000000..5ea248fc0d9
--- /dev/null
+++ b/docs/source/platforms-embedded.md
@@ -0,0 +1,19 @@
+# Embedded Platforms
+
+ExecuTorch supports embedded devices from microcontrollers to edge devices.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Device Type
+
+### Microcontrollers
+- [Cadence Xtensa Backend](backends-cadence)
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [Custom Backend Development](backend-delegates-integration)
+
+### Edge Devices
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [NXP eIQ Neutron Backend](backend-nxp)
+- [Custom Hardware Integration](backend-delegates-integration)
diff --git a/docs/source/quantization-optimization.md b/docs/source/quantization-optimization.md
new file mode 100644
index 00000000000..d2005b3adac
--- /dev/null
+++ b/docs/source/quantization-optimization.md
@@ -0,0 +1,20 @@
+(quantization-optimization)=
+
+# Quantization & Optimization
+
+Advanced techniques for model compression and performance optimization.
+
+## Quantization Strategies
+
+- {doc}`quantization-overview` — Comprehensive quantization strategies and techniques
+
+## Performance Optimization
+
+- {doc}`runtime-profiling` — Performance profiling and optimization techniques
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+quantization-overview
+runtime-profiling
diff --git a/docs/source/quick-start-section.md b/docs/source/quick-start-section.md
new file mode 100644
index 00000000000..b35bed8d22c
--- /dev/null
+++ b/docs/source/quick-start-section.md
@@ -0,0 +1,38 @@
+(quick-start-section)=
+# Quick Start
+
+Get started with ExecuTorch in just a few steps.
+
+This section walks you through the essential steps to get ExecuTorch up and running, from initial setup to exporting your first model for edge deployment.
+
+## What You'll Learn
+
+Follow these guides in order to get started with ExecuTorch:
+
+- **{doc}`getting-started`** - Initial Setup: Set up your development environment and run your first ExecuTorch example.
+
+- **{doc}`using-executorch-export`** - Exporting your model: Export for Edge deployment.
+
+- **{doc}`using-executorch-building-from-source`** - Building from Source: Build ExecuTorch from source for custom configurations and development.
+
+## Prerequisites
+
+- Python 3.10-3.12
+- PyTorch 2.9+
+- Basic familiarity with PyTorch model development
+
+## Next Steps
+
+After completing the quick start, explore:
+
+- **{doc}`edge-platforms-section`** - Deploy to specific platforms (Android, iOS, Desktop, Embedded)
+- **{doc}`backends-section`** - Choose the right acceleration backend for your hardware
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Quick Start Guide
+
+getting-started
+using-executorch-export
+using-executorch-building-from-source
diff --git a/docs/source/runtime-integration-advanced.md b/docs/source/runtime-integration-advanced.md
new file mode 100644
index 00000000000..a76265c4093
--- /dev/null
+++ b/docs/source/runtime-integration-advanced.md
@@ -0,0 +1,20 @@
+(runtime-integration-advanced)=
+
+# Runtime & Integration
+
+Advanced runtime integration topics.
+
+## Platform Integration
+
+- {doc}`runtime-platform-abstraction-layer` — Platform abstraction layer for cross-platform deployment
+
+## Portable C++ Programming
+
+- {doc}`portable-cpp-programming` — Portable C++ programming for cross-platform deployment
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+runtime-platform-abstraction-layer
+portable-cpp-programming
diff --git a/docs/source/success-stories.md b/docs/source/success-stories.md
new file mode 100644
index 00000000000..cba874132c6
--- /dev/null
+++ b/docs/source/success-stories.md
@@ -0,0 +1,56 @@
+(success-stories)=
+
+# Success Stories
+
+Discover how organizations are leveraging ExecuTorch to deploy AI models at scale on edge devices.
+
+---
+
+## 🎯 Featured Success Stories
+
+::::{grid} 1
+:gutter: 3
+
+:::{grid-item-card} **🚀 Story 1: [Title Placeholder]**
+:class-header: bg-primary text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-1-details)
+:::
+
+:::{grid-item-card} **⚡ Story 2: [Title Placeholder]**
+:class-header: bg-success text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+
+[Read Full Story →](#story-2-details)
+:::
+
+:::{grid-item-card} **🧠 Story 3: [Title Placeholder]**
+:class-header: bg-info text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-3-details)
+:::
+
+::::
+
+---
diff --git a/docs/source/support-section.md b/docs/source/support-section.md
new file mode 100644
index 00000000000..64c47a3e55b
--- /dev/null
+++ b/docs/source/support-section.md
@@ -0,0 +1,17 @@
+(support-section)=
+# Support
+
+In this section, find answers to common questions, troubleshooting guides, and information on how to contribute to the ExecuTorch project. Get help with issues and learn how to participate in the community.
+
+- {doc}`using-executorch-faqs` — FAQ
+- {doc}`using-executorch-troubleshooting` — Common Issues
+- {doc}`contributing` — Contributing
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Support
+
+using-executorch-faqs
+using-executorch-troubleshooting
+contributing
diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md
new file mode 100644
index 00000000000..461a1f6849a
--- /dev/null
+++ b/docs/source/tools-section.md
@@ -0,0 +1,30 @@
+(tools-sdk-section)=
+
+# Tools
+
+In this section, explore ExecuTorch's comprehensive developer tools for profiling, debugging, and model inspection. These tools help optimize performance and troubleshoot issues during development and deployment.
+
+- {doc}`devtools-overview` — Developer Tools Overview
+- {doc}`bundled-io` — Bundled I/O
+- {doc}`etrecord` — ETRecord
+- {doc}`etdump` — ETDump
+- {doc}`runtime-profiling` — Profiling Suite
+- {doc}`model-debugging` — Debugging Tools
+- {doc}`model-inspector` — Model Inspector
+- {doc}`memory-planning-inspection` — Memory Planning Inspection
+- {doc}`devtools-tutorial` — Development Utilities
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Tools
+
+devtools-overview
+bundled-io
+etrecord
+etdump
+runtime-profiling
+model-debugging
+model-inspector
+memory-planning-inspection
+devtools-tutorial
diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md
index b3d1836b78a..2363affa7cb 100644
--- a/docs/source/using-executorch-export.md
+++ b/docs/source/using-executorch-export.md
@@ -32,7 +32,7 @@ As part of the .pte file creation process, ExecuTorch identifies portions of the
Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information.
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
+- [XNNPACK (CPU)](backends-xnnpack.md)
- [Core ML (iOS)](backends-coreml.md)
- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
- [Vulkan (Android GPU)](backends-vulkan.md)
From d8a21260d35a4acf2073266820950a819aafb8ae Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 16:42:10 -0700
Subject: [PATCH 145/266] Add Gemma 3 test.
Differential Revision: D84001548
Pull Request resolved: https://github.com/pytorch/executorch/pull/14825
---
.../Exported/ExecuTorchLLMMultimodalRunner.h | 16 ++
.../Exported/ExecuTorchLLMMultimodalRunner.mm | 84 +++++++-
.../__tests__/MultimodalRunnerTest.swift | 179 ++++++++++++++----
.../__tests__/TextRunnerTest.swift | 4 +-
4 files changed, 233 insertions(+), 50 deletions(-)
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index 8523581da8a..250241b9c9d 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
channels:(NSInteger)channels
NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithFloatData:(NSData *)data
+ width:(NSInteger)width
+ height:(NSInteger)height
+ channels:(NSInteger)channels
+ NS_DESIGNATED_INITIALIZER;
+
@property(nonatomic, readonly) NSData *data;
@property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))
@property(nonatomic, readonly) NSInteger channels;
+@property(nonatomic, readonly) BOOL isFloat;
+
+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;
@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
frames:(NSInteger)frames
NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithFloatData:(NSData *)data
+ batchSize:(NSInteger)batchSize
+ bins:(NSInteger)bins
+ frames:(NSInteger)frames
+ NS_DESIGNATED_INITIALIZER;
+
@property(nonatomic, readonly) NSData *data;
@property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))
@property(nonatomic, readonly) NSInteger frames;
+@property(nonatomic, readonly) BOOL isFloat;
+
+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index a3dc3e6afd1..964805053e2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
_width = width;
_height = height;
_channels = channels;
+ _isFloat = NO;
+ }
+ return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+ width:(NSInteger)width
+ height:(NSInteger)height
+ channels:(NSInteger)channels {
+ self = [super init];
+ if (self) {
+ _data = [data copy];
+ _width = width;
+ _height = height;
+ _channels = channels;
+ _isFloat = YES;
}
return self;
}
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
_batchSize = batchSize;
_bins = bins;
_frames = frames;
+ _isFloat = NO;
+ }
+ return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+ batchSize:(NSInteger)batchSize
+ bins:(NSInteger)bins
+ frames:(NSInteger)frames {
+ self = [super init];
+ if (self) {
+ _data = [data copy];
+ _batchSize = batchSize;
+ _bins = bins;
+ _frames = frames;
+ _isFloat = YES;
}
return self;
}
@@ -170,6 +202,7 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
return NO;
}
   std::vector<llm::MultimodalInput> nativeInputs;
+ nativeInputs.reserve((size_t)inputs.count);
for (ExecuTorchLLMMultimodalInput *input in inputs) {
switch (input.type) {
case ExecuTorchLLMMultimodalInputTypeText:
@@ -177,13 +210,50 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
break;
case ExecuTorchLLMMultimodalInputTypeImage: {
ExecuTorchLLMImage *image = input.image;
-        std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
- nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
- std::move(data),
- (int32_t)image.width,
- (int32_t)image.height,
- (int32_t)image.channels
- )));
+ if (image.isFloat) {
+ const float *buffer = (const float *)image.data.bytes;
+ size_t elementCount = (size_t)image.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+ std::move(data),
+ (int32_t)image.width,
+ (int32_t)image.height,
+ (int32_t)image.channels
+ )));
+ } else {
+ const uint8_t *buffer = (const uint8_t *)image.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + image.data.length);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+ std::move(data),
+ (int32_t)image.width,
+ (int32_t)image.height,
+ (int32_t)image.channels
+ )));
+ }
+ break;
+ }
+ case ExecuTorchLLMMultimodalInputTypeAudio: {
+ ExecuTorchLLMAudio *audio = input.audio;
+ if (audio.isFloat) {
+ const float *buffer = (const float *)audio.data.bytes;
+ size_t elementCount = (size_t)audio.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+ std::move(data),
+ (int32_t)audio.batchSize,
+ (int32_t)audio.bins,
+ (int32_t)audio.frames
+ )));
+ } else {
+ const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+ std::move(data),
+ (int32_t)audio.batchSize,
+ (int32_t)audio.bins,
+ (int32_t)audio.frames
+ )));
+ }
break;
}
default: {
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index 7ae9da4969b..7281740c3af 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -10,60 +10,157 @@ import ExecuTorchLLM
import XCTest
extension UIImage {
- func asImage() -> Image {
- let targetSide = CGFloat(336)
- let scale = max(targetSide / size.width, targetSide / size.height)
- let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
+ func centerCropped(to sideSize: CGFloat) -> UIImage {
+ precondition(sideSize > 0)
let format = UIGraphicsImageRendererFormat.default()
format.scale = 1
- let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
- draw(in: CGRect(origin: .zero, size: scaledSize))
- }
- guard let scaledCGImage = scaledImage.cgImage else {
- return Image(data: Data(), width: 336, height: 336, channels: 3)
- }
- let cropRect = CGRect(
- x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
- y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
- width: targetSide.rounded(.down),
- height: targetSide.rounded(.down)
- )
- let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
- let imageWidth = croppedCGImage.width
- let imageHeight = croppedCGImage.height
- let pixelCount = imageWidth * imageHeight
- var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
- let context = CGContext(
+ format.opaque = false
+ return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
+ .image { _ in
+ let scaleFactor = max(sideSize / size.width, sideSize / size.height)
+ let scaledWidth = size.width * scaleFactor
+ let scaledHeight = size.height * scaleFactor
+ let originX = (sideSize - scaledWidth) / 2
+ let originY = (sideSize - scaledHeight) / 2
+ draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
+ }
+ }
+
+ func rgbBytes() -> [UInt8]? {
+ guard let cgImage = cgImage else { return nil }
+ let pixelWidth = Int(cgImage.width)
+ let pixelHeight = Int(cgImage.height)
+ let pixelCount = pixelWidth * pixelHeight
+ let bytesPerPixel = 4
+ let bytesPerRow = pixelWidth * bytesPerPixel
+ var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+ guard let context = CGContext(
data: &rgbaBuffer,
- width: imageWidth,
- height: imageHeight,
+ width: pixelWidth,
+ height: pixelHeight,
bitsPerComponent: 8,
- bytesPerRow: imageWidth * 4,
+ bytesPerRow: bytesPerRow,
space: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
- )!
- context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
- var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
+ ) else { return nil }
+
+ context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
+
+ var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytes[pixelIndex * 3] = rgbaBuffer[pixelIndex * 4]
+      rgbBytes[pixelIndex * 3 + 1] = rgbaBuffer[pixelIndex * 4 + 1]
+      rgbBytes[pixelIndex * 3 + 2] = rgbaBuffer[pixelIndex * 4 + 2]
+    }
+    return rgbBytes
+  }
+
+  func rgbBytesNormalized(mean: [Float], std: [Float]) -> [Float]? {
+ precondition(mean.count == 3 && std.count == 3)
+ precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
+ guard let rgbBytes = rgbBytes() else { return nil }
+ let pixelCount = rgbBytes.count / 3
+ var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<(pixelCount * 3) {
+      let channelIndex = pixelIndex % 3
+      rgbBytesNormalized[pixelIndex] = (Float(rgbBytes[pixelIndex]) / 255 - mean[channelIndex]) / std[channelIndex]
+    }
+    return rgbBytesNormalized
+  }
+
+  func asImage(_ sideSize: CGFloat) -> Image {
+ return Image(
+ data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
+ width: Int(sideSize),
+ height: Int(sideSize),
+ channels: 3
+ )
+ }
+
+ func asNormalizedImage(
+ _ sideSize: CGFloat,
+ mean: [Float] = [0.485, 0.456, 0.406],
+ std: [Float] = [0.229, 0.224, 0.225]
+ ) -> Image {
+ return Image(
+ float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
+ width: Int(sideSize),
+ height: Int(sideSize),
+ channels: 3
+ )
}
}
class MultimodalRunnerTest: XCTestCase {
- let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
- let assistantPrompt = "ASSISTANT: "
+ let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
let userPrompt = "What's on the picture?"
- let sequenceLength = 768
+
+ func testGemma() {
+    let chatTemplate = "<start_of_turn>user\n%@<end_of_turn>\n<start_of_turn>model\n"
+ let sideSize: CGFloat = 896
+ let sequenceLength = 768
+ let bundle = Bundle(for: type(of: self))
+ guard let modelPath = bundle.path(forResource: "gemma3", ofType: "pte"),
+ let tokenizerPath = bundle.path(forResource: "gemma3_tokenizer", ofType: "model"),
+ let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
+ let uiImage = UIImage(contentsOfFile: imagePath) else {
+ XCTFail("Couldn't find model or tokenizer files")
+ return
+ }
+ let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+ var text = ""
+
+ do {
+ try runner.generate([
+ MultimodalInput(systemPrompt),
+ MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
+ ], Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+        if token == "<end_of_turn>" {
+ runner.stop()
+ }
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("waterfall"))
+
+ text = ""
+ runner.reset()
+ do {
+ try runner.generate([
+ MultimodalInput(systemPrompt),
+ MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
+ ], Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+        if token == "<end_of_turn>" {
+ runner.stop()
+ }
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("waterfall"))
+ }
func testLLaVA() {
+ let chatTemplate = "USER: %@ ASSISTANT: "
+ let sideSize: CGFloat = 336
+ let sequenceLength = 768
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+ let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
let uiImage = UIImage(contentsOfFile: imagePath) else {
XCTFail("Couldn't find model or tokenizer files")
@@ -75,8 +172,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
- MultimodalInput(uiImage.asImage()),
- MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+ MultimodalInput(uiImage.asImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
@@ -92,8 +189,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
- MultimodalInput(uiImage.asImage()),
- MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+ MultimodalInput(uiImage.asImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index f7124fec640..0fa2b59d05d 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -42,7 +42,7 @@ class TextRunnerTest: XCTestCase {
func testLLaMA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
+ let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
XCTFail("Couldn't find model or tokenizer files")
return
}
@@ -77,7 +77,7 @@ class TextRunnerTest: XCTestCase {
func testPhi4() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+ let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
XCTFail("Couldn't find model or tokenizer files")
return
}
From c609f635ad6fb7939e7f56ed955a59ae4221a5fb Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Mon, 6 Oct 2025 17:36:45 -0700
Subject: [PATCH 146/266] Fixed assumption on out_shift for quantized linear
Differential Revision: D83875670
Pull Request resolved: https://github.com/pytorch/executorch/pull/14789
---
backends/cadence/aot/ref_implementations.py | 4 ++--
.../aot/tests/test_ref_implementations.py | 16 ++++++++--------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 2642340679e..ad1abb3ce4b 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -330,8 +330,8 @@ def variant(
if out_shift.numel() != 1:
raise ValueError("out_shift must be a scalar")
- if out_shift.dtype != torch.int64:
- raise ValueError("out_shift must be an int64")
+ if out_shift.dtype != torch.int32:
+ raise ValueError("out_shift must be an int32")
_out_shift = int(out_shift.item())
_out_multiplier = int(out_multiplier[0].item())
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index f78d2292e7b..d8a79454097 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -172,7 +172,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[0]], dtype=dtype), # expected_output
per_tensor,
@@ -197,7 +197,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[-2, -8]], dtype=dtype), # expected_output
per_tensor,
@@ -220,7 +220,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[0, 0]], dtype=dtype), # expected_output
per_tensor,
@@ -244,7 +244,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor(
[[[0, -2, -4], [-2, -7, -12]]], dtype=dtype
@@ -270,7 +270,7 @@ def test_quantized_add(
torch.tensor(
[268435456], dtype=torch.int32
), # out_multiplier (1.0 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
1, # out_zero_point
torch.tensor([[1, 1]], dtype=dtype), # expected_output
per_tensor,
@@ -295,7 +295,7 @@ def test_quantized_add(
torch.tensor(
[268435456], dtype=torch.int32
), # out_multiplier (1.0 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
1, # out_zero_point
torch.tensor([[1, 1]], dtype=dtype), # expected_output
False,
@@ -317,7 +317,7 @@ def test_quantized_add(
[268435456], dtype=torch.int32
), # out_multiplier (0.125 * 2^31)
torch.tensor(
- [1], dtype=torch.int64
+ [1], dtype=torch.int32
), # out_shift (shift=1, doubles the scale)
1, # out_zero_point
torch.tensor([[1, 2]], dtype=dtype), # expected_output
@@ -339,7 +339,7 @@ def test_quantized_add(
[268435456], dtype=torch.int32
), # out_multiplier (0.125 * 2^31)
torch.tensor(
- [1], dtype=torch.int64
+ [1], dtype=torch.int32
), # out_shift (shift=1, doubles the scale)
1, # out_zero_point
torch.tensor([[1, 2]], dtype=dtype), # expected_output
From d36bf8ce6ea37d867384f58829418d3a365f8c3b Mon Sep 17 00:00:00 2001
From: derekxu
Date: Mon, 6 Oct 2025 21:44:21 -0700
Subject: [PATCH 147/266] Run ET-eager on message recall
Differential Revision: D83990682
Pull Request resolved: https://github.com/pytorch/executorch/pull/14822
---
examples/models/llama/rope.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 0d1dd306091..ea4e6b37243 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -240,7 +240,7 @@ def __init__(self, params: ModelArgs):
self.precompute_freqs_cis = partial(
hf_precompute_freqs_cis,
partial_rotary_factor=self.params.partial_rotary_factor,
- device=self.params.device,
+ device=getattr(self.params, "device", "cpu"),
)
self.apply_rotary_emb = hf_apply_rotary_emb
else:
@@ -249,7 +249,7 @@ def __init__(self, params: ModelArgs):
use_scaled=self.params.use_scaled_rope,
scale_factor=self.params.rope_scale_factor,
high_freq_factor=self.params.high_freq_factor,
- device=self.params.device,
+ device=getattr(self.params, "device", "cpu"),
)
self.apply_rotary_emb = RotaryEmbedding()
From 0b748bfea8278cfdf60233be475e852d5eaf57f2 Mon Sep 17 00:00:00 2001
From: billmguo
Date: Mon, 6 Oct 2025 21:47:12 -0700
Subject: [PATCH 148/266] oss et update to support SAR2230P
Differential Revision: D83934187
Pull Request resolved: https://github.com/pytorch/executorch/pull/14808
---
backends/qualcomm/serialization/qc_schema.py | 3 +++
backends/qualcomm/utils/utils.py | 2 ++
2 files changed, 5 insertions(+)
diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py
index f3b9e2cc1a5..6f0bceec4c9 100644
--- a/backends/qualcomm/serialization/qc_schema.py
+++ b/backends/qualcomm/serialization/qc_schema.py
@@ -27,6 +27,7 @@ class HtpArch(IntEnum):
V73 = 73
V75 = 75
V79 = 79
+ V81 = 81
@dataclass
@@ -49,6 +50,7 @@ class QcomChipset(IntEnum):
SXR1230P = 45 # v73
SXR2230P = 53 # v69
SXR2330P = 75 # v79
+ SAR2230P = 95 # v81
@dataclass
@@ -69,6 +71,7 @@ class SocInfo:
QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)),
QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)),
QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)),
+ QcomChipset.SAR2230P: SocInfo(QcomChipset.SAR2230P, HtpInfo(HtpArch.V81, 4)),
}
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index be4e86de50f..c57bec43dcf 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -1099,6 +1099,7 @@ def get_soc_to_arch_map():
"SXR1230P": HtpArch.V73,
"SXR2230P": HtpArch.V69,
"SXR2330P": HtpArch.V79,
+ "SAR2230P": HtpArch.V81,
}
@@ -1115,6 +1116,7 @@ def get_soc_to_chipset_map():
"SXR1230P": QcomChipset.SXR1230P,
"SXR2230P": QcomChipset.SXR2230P,
"SXR2330P": QcomChipset.SXR2330P,
+ "SAR2230P": QcomChipset.SAR2230P,
}
From 2c603e43dc2f2db2e1e48512431f21b5910a0a73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?=
<33344797+martinlsm@users.noreply.github.com>
Date: Tue, 7 Oct 2025 14:37:05 +0200
Subject: [PATCH 149/266] Arm backend: Move rescale ops out of node visitors
(#14584)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some TOSA ops do not support INT8 as inputs and outputs. Instead, only
INT32 is supported as a whole number type. Prior to this patch, affected
node visitors inserted rescale ops between the data types INT8 and INT32
before and after the operator such that it will accept its input and
output.
Change this by moving the insertion of the rescale ops to a new pass
called `InsertRescaleInt32Pass`. This will further enable optimizations
to the graph by fusing the rescale nodes.
Only comparison, ABS, MAXIMUM and MINIMUM operators are handled in this
patch; the remaining ones are left out to be done in another patch.
### Test plan
This is refactoring which means that external behavior is not altered. A
new pass `InsertRescaleInt32Pass` has been added and it comes with a new
unit test in backends/arm/test/passes/test_insert_rescale_i32_pass.py.
Signed-off-by: Martin Lindström
Co-authored-by: Oscar Andersson
---
backends/arm/_passes/__init__.py | 2 +-
backends/arm/_passes/arm_pass_manager.py | 2 +
backends/arm/_passes/insert_rescales_pass.py | 240 +++++++++++++++++-
backends/arm/operators/op_abs.py | 90 +------
backends/arm/operators/op_eq.py | 15 +-
backends/arm/operators/op_ge.py | 15 +-
backends/arm/operators/op_gt.py | 15 +-
backends/arm/operators/op_le.py | 15 +-
backends/arm/operators/op_lt.py | 15 +-
backends/arm/operators/op_maximum.py | 48 +---
backends/arm/operators/op_minimum.py | 45 +---
.../passes/test_insert_rescale_i32_pass.py | 77 ++++++
12 files changed, 341 insertions(+), 238 deletions(-)
create mode 100644 backends/arm/test/passes/test_insert_rescale_i32_pass.py
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index 93bf20e69c1..008bc305aad 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -81,7 +81,7 @@
from .insert_int32_casts_after_int64_placeholders import ( # noqa
InsertInt32CastsAfterInt64PlaceholdersPass,
)
-from .insert_rescales_pass import InsertRescalePass # noqa
+from .insert_rescales_pass import InsertRescaleInt32Pass, InsertRescalePass # noqa
from .insert_table_ops import InsertTableOpsPass # noqa
from .match_arg_dtype_pass import MatchArgDtypePass # noqa
from .match_arg_ranks_pass import MatchArgRanksPass # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index b7c511bbe0b..1a0f4e4d384 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -81,6 +81,7 @@
FuseEqualPlaceholdersPass,
FuseQuantizedActivationPass,
InsertInt32CastsAfterInt64PlaceholdersPass,
+ InsertRescaleInt32Pass,
InsertRescalePass,
InsertTableOpsPass,
MatchArgDtypePass,
@@ -214,6 +215,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
self.add_pass(ToTosaMemoryFormatPass(exported_program))
self.add_pass(RemoveNoopPass())
self.add_pass(InsertRescalePass())
+ self.add_pass(InsertRescaleInt32Pass())
self.validate_constraints_mandatory()
return self._transform(exported_program.graph_module)
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
index 100ac03c2b0..d56e70e78b3 100644
--- a/backends/arm/_passes/insert_rescales_pass.py
+++ b/backends/arm/_passes/insert_rescales_pass.py
@@ -4,9 +4,14 @@
# LICENSE file in the root directory of this source tree.
from copy import copy
-from typing import cast, Set, Type
+from typing import cast, Dict, Optional, Set, Tuple, Type
-from executorch.backends.arm._passes.arm_pass_utils import create_node
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import create_node, set_node_arg
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+ get_output_qparams,
+)
from executorch.backends.arm._passes.quant_args import QuantArgs
from executorch.backends.arm.constants import DQ_OPS, Q_OPS
from executorch.exir.dialects._ops import ops as exir_ops
@@ -65,3 +70,234 @@ def call(self, graph_module: GraphModule) -> PassResult:
graph_module = super().call(graph_module).graph_module
graph_module.recompile()
return PassResult(graph_module, modified)
+
+
+class InsertRescaleInt32Pass(ArmPass):
+ """
+ Numerous TOSA ops require inputs and outputs to be 32-bit integers in their
+ quantized implementations. This pass treats such operator nodes by
+ inserting rescale ops before and after them if needed. Note that extra logic
+ that handles the scales and zero points must be in place because the affected
+ TOSA have naive implementations that do not account for the quantization
+ parameters.
+ """
+
+ _passes_required_after: Set[Type[ExportPass]] = set()
+
+ included_targets = [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ exir_ops.edge.aten.maximum.default,
+ exir_ops.edge.aten.minimum.default,
+ ]
+
+ def _int32_qargs(self, s):
+ """Helper creator function for INT32-based QuantArgs"""
+
+ return QuantArgs(
+ scale=s,
+ zp=0,
+ qmin=torch.iinfo(torch.int32).min,
+ qmax=torch.iinfo(torch.int32).max,
+ dtype=torch.int32,
+ )
+
+ def _get_inputs_rescaled_qparams(
+ self, target, input_qparams: Dict[int, QuantArgs]
+ ) -> Dict[int, QuantArgs]:
+ """Get the qparams for the INT32 operands to the op ``target``
+
+ Inputs to the INT32-based operator must be rescaled from INT8 to INT32.
+ This function computes the ``QuantArgs`` for each of the operands and returns
+ it as a dict, mapping tensor index to ``QuantArgs``.
+ """
+
+ if target in [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ exir_ops.edge.aten.minimum.default,
+ exir_ops.edge.aten.maximum.default,
+ ]:
+ # For these ops, use the smallest scale among the INT8 operands.
+ min_scale = min(
+ [qp.get_scale_per_tensor() for qp in input_qparams.values()]
+ )
+ qparams = {
+ i: self._int32_qargs(min_scale) for i in range(len(input_qparams))
+ }
+ else:
+ raise ValueError(f"Not a valid target: {target}")
+
+ return qparams
+
+ def _get_output_qparams(
+ self, target, inputs_qparams: Dict[int, QuantArgs]
+ ) -> Optional[QuantArgs]:
+ """Given an op ``target`` and the ``QuantArgs`` for each of its inputs, compute
+ the scale of the output based on how the operator itself affects it."""
+
+ if target in [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.maximum.default,
+ exir_ops.edge.aten.minimum.default,
+ ]:
+ # The op has not altered the scale; the output scale is equal to
+ # the operands' scales.
+ return self._int32_qargs(inputs_qparams[0].get_scale_per_tensor())
+ elif target in [
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ ]:
+ # Output is bool for these ops and thus no qparams are present
+ return None
+ else:
+ raise ValueError(f"Not a valid target: {target}")
+
+ def _get_rescale_qparams(
+ self, target, input_qparams: Dict[int, QuantArgs]
+ ) -> Tuple[Dict[int, QuantArgs], Optional[QuantArgs]]:
+ """
+ Get the quantization parameters of the INT32 inputs/outputs that will
+ surround the node after the new RESCALE ops have been inserted.
+ """
+
+ inputs_rescaled_qparams = self._get_inputs_rescaled_qparams(
+ target, input_qparams
+ )
+ output_qparams = self._get_output_qparams(target, inputs_rescaled_qparams)
+
+ return (inputs_rescaled_qparams, output_qparams)
+
+ def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> bool:
+ qargs = node.meta["input_qparams"]
+
+ args_copy = list(node.args)
+ seen_args = set()
+ modified = False
+ for i in qargs:
+ qp = qargs[i]
+ if qp.dtype != torch.int8:
+ continue
+
+ arg_node = args_copy[i]
+ if arg_node in seen_args:
+ continue
+ seen_args.add(arg_node)
+
+ with graph.inserting_after(arg_node):
+ rescale_node = create_node(
+ graph,
+ exir_ops.backend.tosa.RESCALE.default,
+ (
+ arg_node,
+ torch.int32,
+ qp.get_scale_per_tensor()
+ / rescale_qargs[
+ i
+ ].get_scale_per_tensor(), # Old scale / new scale
+ qp.get_zp_per_tensor(), # Old zero point
+ rescale_qargs[i].get_zp_per_tensor(), # New zero point
+ ),
+ from_node=node,
+ )
+
+ node.replace_input_with(arg_node, rescale_node)
+ modified = True
+
+ return modified
+
+ def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> bool:
+ if "output_qparams" not in node.meta or len(node.meta["output_qparams"]) == 0:
+ return False
+
+ qargs = get_output_qparams(node)
+ assert len(qargs) == 1
+ assert rescale_qargs is not None
+
+ qarg = qargs[0]
+ if qarg.dtype != torch.int8:
+ return False
+
+ users_copy = list(node.users)
+
+ with graph.inserting_after(node):
+ rescale_node = create_node(
+ graph,
+ exir_ops.backend.tosa.RESCALE.default,
+ (
+ node,
+ torch.int8,
+ rescale_qargs.get_scale_per_tensor()
+ / qarg.get_scale_per_tensor(), # Old scale / new scale
+ rescale_qargs.get_zp_per_tensor(), # Old zero point
+ qarg.get_zp_per_tensor(), # New zero point
+ ),
+ from_node=node,
+ )
+
+ for user in users_copy:
+ user.replace_input_with(node, rescale_node)
+
+ return True
+
+ def call(self, graph_module: GraphModule) -> PassResult:
+ graph = graph_module.graph
+
+ modified = False
+ for node in list(graph.nodes):
+ node = cast(Node, node)
+
+ if node.op != "call_function" or node.target not in self.included_targets:
+ continue
+
+ if "input_qparams" not in node.meta or len(node.meta["input_qparams"]) == 0:
+ continue
+ input_qparams = node.meta["input_qparams"]
+
+ inputs_rescale_qargs, output_rescale_qargs = self._get_rescale_qparams(
+ node.target, input_qparams
+ )
+
+ inputs_was_rescaled = self._rescale_inputs(
+ graph, node, inputs_rescale_qargs
+ )
+ outputs_was_rescaled = False
+ if inputs_was_rescaled:
+ outputs_was_rescaled = self._rescale_outputs(
+ graph, node, output_rescale_qargs
+ )
+ modified = True
+
+ # Update node metadata
+
+ if inputs_was_rescaled:
+ assert len(inputs_rescale_qargs) == len(node.meta["input_qparams"])
+ node.meta["input_qparams"] = inputs_rescale_qargs
+
+ if outputs_was_rescaled:
+ assert len(node.meta["output_qparams"]) == 1
+ node.meta["output_qparams"] = {0: output_rescale_qargs}
+
+ # If the output type is specified in the node, change it such
+ # that it matches the subsequent rescale node(s) that this node
+ # now has output edges to.
+ if "dtype" in node.kwargs:
+ set_node_arg(node, "dtype", torch.int32)
+
+ if modified:
+ # Retrace the graph to update the fake tensor types
+ graph_module = super().call(graph_module).graph_module
+ graph_module.recompile()
+
+ return PassResult(graph_module, modified)
diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py
index ec76eb5517f..943c4778867 100644
--- a/backends/arm/operators/op_abs.py
+++ b/backends/arm/operators/op_abs.py
@@ -6,9 +6,6 @@
# pyre-unsafe
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-import executorch.backends.arm.tosa.utils as tutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -18,22 +15,20 @@
validate_same_dtype,
validate_valid_dtype,
)
-from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
+from executorch.backends.arm.tosa.specification import TosaSpecification
from torch.fx import Node
@register_node_visitor
-class AbsVisitor_INT(NodeVisitor):
+class AbsVisitor(NodeVisitor):
target = "aten.abs.default"
tosa_specs = [
TosaSpecification.create_from_string("TOSA-1.0+INT"),
+ TosaSpecification.create_from_string("TOSA-1.0+FP"),
]
- def __init__(self, *args):
- super().__init__(*args)
-
def define_node(
self,
node: Node,
@@ -47,89 +42,18 @@ def define_node(
validate_num_inputs(self.target, inputs, 1)
validate_same_dtype(self.target, [*inputs, output], ts)
- # Handle int8 (quantized) and int32
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- if inputs[0].dtype == ts.DType.INT8:
- rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- ) # type: ignore[possibly-undefined]
- else:
- # input[0].dtype == ts.DType.INT32
- # Non quantized input, natively support by TOSA.abs
- rescaled_inputs = inputs
-
- if output.dtype == ts.DType.INT8:
- broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
- abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
- else:
- # output.dtype == ts.DType.INT32
- abs_output = output
-
- # Do the INT32 Abs
- self._serialize_operator(
- node,
- tosa_graph,
+ tosa_graph.addOperator(
ts.TosaOp.Op().ABS,
[
- rescaled_inputs[0].name,
+ inputs[0].name,
],
- [abs_output.name],
+ [output.name],
None,
)
-
- if output.dtype == ts.DType.INT8:
- # Scale output back to 8 bit
- # pyre-ignore
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, abs_output, scale_back, node, self.tosa_spec
- ) # type: ignore[possibly-undefined]
-
-
-@register_node_visitor
-class AbsVisitor_FP(AbsVisitor_INT):
- # inheriting 'target' from BI class
-
- tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
-
- def __init__(self, *args):
- super().__init__(*args)
-
- def define_node(
- self,
- node: Node,
- tosa_graph: Any,
- inputs: List[TosaArg],
- output: TosaArg,
- ) -> None:
-
- import serializer.tosa_serializer as ts # type: ignore
-
- validate_num_inputs(self.target, inputs, 1)
- validate_same_dtype(self.target, [*inputs, output], ts)
-
- if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
- # Call the inherited define_node for handling integers
- super().define_node(node, tosa_graph, inputs, output)
- else:
- # FP32 Abs lowering
-
- validate_valid_dtype(
- self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec
- )
-
- # MI lowering
- self._serialize_operator(
- node,
- tosa_graph,
- ts.TosaOp.Op().ABS,
- [inputs[0].name],
- [output.name],
- None,
- )
diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py
index 2136fe2e946..76b6e67cd8d 100644
--- a/backends/arm/operators/op_eq.py
+++ b/backends/arm/operators/op_eq.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,23 +54,12 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
# Do the equal comparison
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().EQUAL,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py
index c538e735880..4bb20cac77f 100644
--- a/backends/arm/operators/op_ge.py
+++ b/backends/arm/operators/op_ge.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER_EQUAL,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py
index d407e28c1b6..c25c959681e 100644
--- a/backends/arm/operators/op_gt.py
+++ b/backends/arm/operators/op_gt.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py
index 403c6c233d3..e62d669814f 100644
--- a/backends/arm/operators/op_le.py
+++ b/backends/arm/operators/op_le.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER_EQUAL,
- [input_nodes[1].name, input_nodes[0].name],
+ [inputs[1].name, inputs[0].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py
index f5132dd4feb..cccb0abd5d7 100644
--- a/backends/arm/operators/op_lt.py
+++ b/backends/arm/operators/op_lt.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER,
- [input_nodes[1].name, input_nodes[0].name],
+ [inputs[1].name, inputs[0].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py
index 66437f8af1d..50c6e06a4bb 100644
--- a/backends/arm/operators/op_maximum.py
+++ b/backends/arm/operators/op_maximum.py
@@ -7,12 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
- get_input_qparams,
-)
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -22,9 +16,8 @@
validate_same_dtype,
validate_valid_dtype,
)
-from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
+from executorch.backends.arm.tosa.specification import TosaSpecification
from torch.fx import Node
@@ -56,35 +49,12 @@ def define_node(
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- max_output = output
- if inputs[0].dtype == ts.DType.INT8:
- input_qparams = get_input_qparams(node)
- if len(input_qparams) != 2:
- raise ValueError(
- f"Both inputs need to have quantization information for {node}"
- )
- if input_qparams[0] != input_qparams[1]:
- raise ValueError(
- "Both inputs must have the same quantization parameters for MAX"
- )
-
- operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- output.shape = tosa_shape(output.shape, output.dim_order)
- max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
- else:
- operand_inputs = inputs
-
attr_maximum = ts.TosaSerializerAttribute()
-
- # Set to PROPOGATE as default
+ # Set to PROPAGATE as default
attr_maximum.MaximumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
self._serialize_operator(
@@ -92,15 +62,9 @@ def define_node(
tosa_graph,
ts.TosaOp.Op().MAXIMUM,
[
- operand_inputs[0].name,
- operand_inputs[1].name,
+ inputs[0].name,
+ inputs[1].name,
],
- [max_output.name],
+ [output.name],
attr_maximum,
)
-
- if output.dtype == ts.DType.INT8:
- # insert RESCALE from int32 back to int8
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, max_output, scale_back, node, self.tosa_spec
- )
diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py
index 518366d5463..d5b97f186d3 100644
--- a/backends/arm/operators/op_minimum.py
+++ b/backends/arm/operators/op_minimum.py
@@ -7,11 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
- get_input_qparams,
-)
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -23,7 +18,6 @@
)
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
from torch.fx import Node
@@ -55,35 +49,12 @@ def define_node(
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- min_output = output
- if inputs[0].dtype == ts.DType.INT8:
- input_qparams = get_input_qparams(node)
- if len(input_qparams) != 2:
- raise ValueError(
- f"Both inputs need to have quantization information for {node}"
- )
- if input_qparams[0] != input_qparams[1]:
- raise ValueError(
- "Both inputs must have the same quantization parameters for MIN"
- )
-
- operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- output.shape = tosa_shape(output.shape, output.dim_order)
- min_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
- else:
- operand_inputs = inputs
-
attr_minimum = ts.TosaSerializerAttribute()
-
- # Set to PROPOGATE as default
+ # Set to PROPAGATE as default
attr_minimum.MinimumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
self._serialize_operator(
@@ -91,15 +62,9 @@ def define_node(
tosa_graph,
ts.TosaOp.Op().MINIMUM,
[
- operand_inputs[0].name,
- operand_inputs[1].name,
+ inputs[0].name,
+ inputs[1].name,
],
- [min_output.name],
+ [output.name],
attr_minimum,
)
-
- if output.dtype == ts.DType.INT8:
- # insert RESCALE from int32 back to int8
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, min_output, scale_back, node, self.tosa_spec
- )
diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
new file mode 100644
index 00000000000..096c90d330d
--- /dev/null
+++ b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
@@ -0,0 +1,77 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm._passes import (
+ FoldAndAnnotateQParamsPass,
+ InsertRescaleInt32Pass,
+)
+from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+
+
+class NeedsRescaleOps(torch.nn.Module):
+ """A module containing ops that require INT32 inputs/outputs."""
+
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x, y):
+ a = torch.maximum(x, y)
+ b = torch.abs(a)
+ c = a > b
+ return c
+
+ def get_inputs(self, dtype) -> input_t:
+ if dtype == torch.float32:
+ return (torch.rand(1, 3, 5, 6), torch.rand(1, 3, 5, 6))
+ elif dtype == torch.int32:
+ return (
+ torch.randint(3, 5, (3,), dtype=torch.int32),
+ torch.randint(3, 5, (3,), dtype=torch.int32),
+ )
+ else:
+ raise ValueError("Not a valid input dtype for model")
+
+
+def test_insert_rescales():
+ module = NeedsRescaleOps()
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+ ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ ops_after = {
+ # "number of op nodes with i8 output" + "number of i8 node inputs"
+ "executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2
+ + 5,
+ }
+ pipeline = PassPipeline[input_t](
+ module,
+ module.get_inputs(torch.float32),
+ quantize=True,
+ ops_not_before_pass=ops_not_before,
+ ops_after_pass=ops_after,
+ pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+ )
+ pipeline.pop_stage("run_method_and_compare_outputs")
+ pipeline.run()
+
+
+def test_dont_insert_rescales():
+ module = NeedsRescaleOps()
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+ ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ # All inputs are already i32. Rescales should not be added.
+ ops_not_after = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ pipeline = PassPipeline[input_t](
+ module,
+ module.get_inputs(torch.int32),
+ ops_not_before_pass=ops_not_before,
+ ops_not_after_pass=ops_not_after,
+ pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+ )
+ pipeline.pop_stage("run_method_and_compare_outputs")
+ pipeline.run()
From 1b8d380bf1db79ce22fba5096aefb80c2224e5a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0imon=20Str=C3=BD=C4=8Dek?=
Date: Tue, 7 Oct 2025 15:30:38 +0200
Subject: [PATCH 150/266] NXP backend: Add NXP backend tutorial page (#14850)
### Summary
Adds tutorial page for NXP backend.
### Test plan
Documentation built locally using Makefile without any problems.
cc @robert-kalmar @JakeStevens @digantdesai
---
docs/source/backends-nxp.md | 41 ++++++++++++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 3 deletions(-)
diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md
index f02f495f685..4783b4a5bc6 100644
--- a/docs/source/backends-nxp.md
+++ b/docs/source/backends-nxp.md
@@ -1,5 +1,40 @@
# NXP eIQ Neutron Backend
-See
-[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md)
-for current status about running ExecuTorch on NXP eIQ Neutron Backend.
+This manual page introduces how to use ExecuTorch with the NXP eIQ Neutron Backend.
+NXP offers accelerated machine learning models inference on edge devices.
+To learn more about NXP's machine learning acceleration platform, please refer to [the official NXP website](https://www.nxp.com/applications/technologies/ai-and-machine-learning:MACHINE-LEARNING).
+
+
+For up-to-date status about running ExecuTorch on the Neutron Backend, please visit the
+[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md) manual page.
+
+
+## Features
+
+Executorch v1.0 supports running machine learning models on selected NXP chips (for now only i.MXRT700).
+Among currently supported machine learning models are:
+- Convolution-based neural networks
+- Full support for MobileNetv2 and CifarNet
+
+## Prerequisites (Hardware and Software)
+
+In order to successfully build the ExecuTorch project and convert models for the NXP eIQ Neutron Backend, you will need a computer running Windows or Linux.
+
+If you want to test the runtime, you'll also need:
+- Hardware with NXP's [i.MXRT700](https://www.nxp.com/products/i.MX-RT700) chip or a testing board like MIMXRT700-AVK
+- [MCUXpresso IDE](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-integrated-development-environment-ide:MCUXpresso-IDE) or [MCUXpresso Visual Studio Code extension](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-for-visual-studio-code:MCUXPRESSO-VSC)
+
+## Using NXP backend
+
+To test converting a neural network model for inference on NXP eIQ Neutron Backend, you can use our example script:
+
+```shell
+# cd to the root of executorch repository
+./examples/nxp/aot_neutron_compile.sh [model (cifar10 or mobilenetv2)]
+```
+
+For a quick overview of how to convert a custom PyTorch model, take a look at our [example Python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py).
+
+## Runtime Integration
+
+To learn how to run the converted model on NXP hardware, use one of the ExecuTorch runtime example projects from the MCUXpresso IDE example projects list.
+For a more fine-grained tutorial, visit [this manual page](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html).
From d8e07bd20c848f8b85d78444d8b9b5dcf8df2924 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Tue, 7 Oct 2025 07:55:15 -0700
Subject: [PATCH 151/266] Add .ptd support to portable executor runner (#14833)
This pull request enhances the `executor_runner` example by adding
support for loading and using `.ptd` (portable tensor data) files. This
enables the runner to ingest pre-serialized tensor data, improving
flexibility for model input handling. The changes include updates to
both build configuration and the main runner logic.
**Support for .ptd file loading and usage:**
* Added a new command-line flag `data_path` to specify the path to a
`.ptd` data file in `executor_runner.cpp` and integrated logic to load
this file and parse its contents using `FlatTensorDataMap`.
[[1]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R54)
[[2]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R177-R204)
* Updated the runner to pass the loaded tensor data map to the model
method loader, allowing methods to access pre-loaded input data.
**Build and dependency updates:**
* Included `flat_tensor_data_map` as a dependency in both the Bazel
build targets and CMake build configuration to ensure the new
functionality is available during compilation.
[[1]](diffhunk://#diff-d613fef537c6c97cf343cfcde252e980f7673c21aad54b40a2315aa44c284a8cR22)
[[2]](diffhunk://#diff-d613fef537c6c97cf343cfcde252e980f7673c21aad54b40a2315aa44c284a8cR42)
[[3]](diffhunk://#diff-1e7de1ae2d059d21e1dd75d5812d5a34b0222cef273b7c3a2af62eb747f9d20aR1024-R1026)
* Added the necessary header include for `flat_tensor_data_map` in
`executor_runner.cpp` and updated the relevant namespace usage.
[[1]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R29)
[[2]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R77)
## Test Plan:
Tested with .pte and .ptd for CUDA backend:
```
python -m executorch.examples.cuda.scripts.export --model_name linear --output_dir ./
```
Make sure we have `linear.pte` and `aoti_cuda_blob.ptd`.
Build executor runner with the following options:
```
cmake -DCMAKE_BUILD_TYPE=Debug -DEXECUTORCH_BUILD_CUDA=ON -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON -S. -Bcmake-out
```
Then do:
```
cmake --build cmake-out -j8
```
Then we can run:
```
cmake-out/executor_runner --model_path linear.pte --ptd_path aoti_cuda_blob.ptd
I 00:00:00.000594 executorch:executor_runner.cpp:189] PTD file aoti_cuda_blob.ptd is loaded.
I 00:00:00.000671 executorch:executor_runner.cpp:199] PTD data map created with 1 keys.
I 00:00:00.000749 executorch:executor_runner.cpp:249] Model file linear.pte is loaded.
I 00:00:00.000758 executorch:executor_runner.cpp:258] Using method forward
I 00:00:00.000770 executorch:executor_runner.cpp:309] Setting up planned buffer 0, size 96.
I 00:00:00.002908 executorch:cuda_backend.cpp:140] Writing 394624 bytes to /tmp/linear_so_blob844427.so
I 00:00:00.324783 executorch:cuda_backend.cpp:174] container_handle = 0x26a71b0
I 00:00:00.324867 executorch:executor_runner.cpp:337] Method loaded.
I 00:00:00.325796 executorch:cuda_backend.cpp:249] Inputs copied to GPU
I 00:00:00.325829 executorch:cuda_backend.cpp:278] Outputs created on GPU
E 00:00:00.326623 executorch:memory.cpp:286] Cannot delete null tensor
I 00:00:00.326678 executorch:executor_runner.cpp:374] Model executed successfully 1 time(s) in 1.777041 ms.
I 00:00:00.326691 executorch:executor_runner.cpp:383] 1 outputs:
OutputX 0: tensor(sizes=[3, 3], [-0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356])
E 00:00:00.328474 executorch:memory.cpp:299] Didn't find tensor 0x699a3d0
```
---
.ci/scripts/test_model.sh | 11 +++---
.ci/scripts/utils.sh | 7 ++--
CMakeLists.txt | 4 +++
examples/portable/custom_ops/CMakeLists.txt | 10 ++++--
.../executor_runner/executor_runner.cpp | 36 ++++++++++++++++++-
examples/portable/executor_runner/targets.bzl | 2 ++
.../selective_build/advanced/CMakeLists.txt | 9 +++--
examples/selective_build/basic/CMakeLists.txt | 9 +++--
.../flat_tensor/flat_tensor_data_map.cpp | 2 +-
.../serialize/flat_tensor_header.cpp | 2 ++
tools/cmake/preset/arm_baremetal.cmake | 3 +-
tools/cmake/preset/default.cmake | 4 +--
12 files changed, 81 insertions(+), 18 deletions(-)
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index de28597b1d5..8449809ffe3 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -48,22 +48,25 @@ prepare_artifacts_upload() {
fi
}
+
build_cmake_executor_runner() {
local backend_string_select="${1:-}"
echo "Building executor_runner"
rm -rf ${CMAKE_OUTPUT_DIR}
mkdir ${CMAKE_OUTPUT_DIR}
+ # Common options:
+ COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE"
if [[ "$backend_string_select" == "XNNPACK" ]]; then
echo "Backend $backend_string_select selected"
- (cd ${CMAKE_OUTPUT_DIR} \
- && cmake -DCMAKE_BUILD_TYPE=Release \
+ cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_XNNPACK=ON \
- -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+ ${COMMON} \
+ -B${CMAKE_OUTPUT_DIR} .
cmake --build ${CMAKE_OUTPUT_DIR} -j4
else
cmake -DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
- -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+ ${COMMON} \
-B${CMAKE_OUTPUT_DIR} .
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
fi
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index f6f6ece786b..f896d3f1d40 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -125,14 +125,15 @@ build_executorch_runner_cmake() {
clean_executorch_install_folders
mkdir "${CMAKE_OUTPUT_DIR}"
- pushd "${CMAKE_OUTPUT_DIR}" || return
if [[ $1 == "Debug" ]]; then
CXXFLAGS="-fsanitize=address,undefined"
else
CXXFLAGS=""
fi
- CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" ..
- popd || return
+ CXXFLAGS="$CXXFLAGS" retry cmake \
+ -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+ -DCMAKE_BUILD_TYPE="${1:-Release}" \
+ -B${CMAKE_OUTPUT_DIR} .
if [ "$(uname)" == "Darwin" ]; then
CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7012ec641bf..6a36d7e563a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1021,6 +1021,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
extension_runner_util gflags executorch_backends
)
+ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+ list(APPEND _executor_runner_libs extension_flat_tensor)
+ endif()
+
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/examples/portable/custom_ops/CMakeLists.txt b/examples/portable/custom_ops/CMakeLists.txt
index 4188554af79..8e679697b47 100644
--- a/examples/portable/custom_ops/CMakeLists.txt
+++ b/examples/portable/custom_ops/CMakeLists.txt
@@ -117,8 +117,14 @@ list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_executable(custom_ops_executor_runner ${_executor_runner__srcs})
target_link_libraries(
- custom_ops_executor_runner custom_ops_lib executorch extension_evalue_util
- extension_runner_util gflags
+ custom_ops_executor_runner
+ custom_ops_lib
+ executorch
+ extension_evalue_util
+ extension_runner_util
+ gflags
+ extension_data_loader
+ extension_flat_tensor
)
target_compile_options(
custom_ops_executor_runner PUBLIC ${_common_compile_options}
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 5ce872eec8e..0974e751203 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
#include
#include
+#include
#include
#include
#include
@@ -50,6 +51,7 @@ DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
+DEFINE_string(data_path, "", "Path to data file.");
DEFINE_string(inputs, "", "Comma-separated list of input files");
DEFINE_string(
output_file,
@@ -72,6 +74,7 @@ DEFINE_int32(
using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::EventTracer;
@@ -171,6 +174,34 @@ int main(int argc, char** argv) {
"FileDataLoader::from() failed: 0x%" PRIx32,
(uint32_t)loader.error());
+ // Load .ptd file if provided
+ std::unique_ptr