From e489d6c7df7c4ed6fdf22414a52d631be5f6eedb Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 20 Jun 2025 17:13:03 -0700
Subject: [PATCH 001/266] Runtime support for openvino quantized models
---
backends/openvino/runtime/OpenvinoBackend.cpp | 24 +++++++++++++++++++
backends/openvino/scripts/openvino_build.sh | 1 +
examples/models/llama/CMakeLists.txt | 8 +++++++
tools/cmake/executorch-config.cmake | 1 +
4 files changed, 34 insertions(+)
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index a3134f72b4b..39a1bf55c32 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -114,6 +114,26 @@ exr::Error OpenvinoBackend::execute(
ov_type, input_shape, input_tensor.mutable_data_ptr());
infer_request->set_input_tensor(i, ov_input_tensor);
+
+ if (args[i]->isInt()) {
+ int64_t *val = &(args[i]->payload.copyable_union.as_int);
+
+ // Create OpenVINO tensor from integer input
+ ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+ infer_request->set_input_tensor(i, ov_input_tensor);
+ } else {
+ auto input_tensor = args[i]->toTensor();
+ ov::Shape input_shape(
+ input_tensor.sizes().begin(), input_tensor.sizes().end());
+
+ // Convert input tensor to OpenVINO tensor
+ ov::element::Type ov_type =
+ convert_to_openvino_type(input_tensor.scalar_type());
+ ov::Tensor ov_input_tensor(
+ ov_type, input_shape, input_tensor.mutable_data_ptr());
+
+ infer_request->set_input_tensor(i, ov_input_tensor);
+ }
}
// Set outputs
@@ -165,10 +185,14 @@ ov::element::Type OpenvinoBackend::convert_to_openvino_type(
switch (scalar_type) {
case exa::ScalarType::Float:
return ov::element::f32;
+ case exa::ScalarType::Half:
+ return ov::element::f16;
case exa::ScalarType::Int:
return ov::element::i32;
case exa::ScalarType::Char:
return ov::element::i8;
+ case exa::ScalarType::Byte:
+ return ov::element::u8;
case exa::ScalarType::Long:
return ov::element::i64;
case exa::ScalarType::Bool:
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index bc85d6b8410..c10a3bb4eeb 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -29,6 +29,7 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
-B"${build_dir}"
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index 8c27de20845..1063ebf2561 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -189,6 +189,14 @@ if(TARGET mpsdelegate)
target_link_options_shared_lib(mpsdelegate)
endif()
+# Openvino backend
+if(TARGET openvino_backend)
+ find_package(OpenVINO REQUIRED)
+ target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core)
+ list(APPEND link_libraries openvino_backend)
+ target_link_options_shared_lib(openvino_backend)
+endif()
+
if(TARGET coremldelegate)
find_library(SQLITE_LIBRARY sqlite3)
list(
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index aa5776163a9..adf978fb70a 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -94,6 +94,7 @@ set(lib_list
quantized_kernels
quantized_ops_lib
quantized_ops_aot_lib
+ openvino_backend
)
foreach(lib ${lib_list})
# Name of the variable which stores result of the find_library search
From f0d901f3358fc9bc59b97450111ec0071b90044a Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 20 Jun 2025 21:41:24 -0700
Subject: [PATCH 002/266] openvino export_llama_lib support
---
examples/models/llama/config/llm_config.py | 17 +++++++++++++++++
examples/models/llama/export_llama_lib.py | 17 +++++++++++++++++
extension/llm/export/partitioner_lib.py | 13 +++++++++++++
3 files changed, 47 insertions(+)
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 034d8af7562..2de58fe47eb 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -437,6 +437,16 @@ class MPSConfig:
enabled: bool = False
+@dataclass
+class OpenvinoConfig:
+ """
+ Configures the OpenVINO backend.
+ """
+
+ enabled: bool = False
+ device: str = "CPU"
+
+
@dataclass
class BackendConfig:
"""
@@ -449,6 +459,7 @@ class BackendConfig:
vulkan: VulkanConfig = field(default_factory=VulkanConfig)
qnn: QNNConfig = field(default_factory=QNNConfig)
mps: MPSConfig = field(default_factory=MPSConfig)
+ openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig)
################################################################################
@@ -609,6 +620,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
if hasattr(args, "mps"):
llm_config.backend.mps.enabled = args.mps
+ # Openvino
+ if hasattr(args, "openvino"):
+ llm_config.backend.openvino.enabled = args.openvino
+ if hasattr(args, "openvino_device"):
+ llm_config.backend.openvino.device = args.openvino_device
+
# DebugConfig
if hasattr(args, "profile_memory"):
llm_config.debug.profile_memory = args.profile_memory
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1f055d65822..8afaa8bf409 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -39,6 +39,7 @@
from executorch.extension.llm.export.partitioner_lib import (
get_coreml_partitioner,
get_mps_partitioner,
+ get_openvino_partitioner,
get_qnn_partitioner,
get_vulkan_partitioner,
get_xnnpack_partitioner,
@@ -443,6 +444,14 @@ def build_args_parser() -> argparse.ArgumentParser:
action="store_true",
help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True",
)
+ parser.add_argument("--openvino", action="store_true")
+ parser.add_argument(
+ "--openvino_device",
+ type=str,
+ default=None,
+ choices=["CPU", "GPU"],
+ help="Specify the device for Openvino (CPU or GPU).",
+ )
parser.add_argument(
"--expand_rope_table",
@@ -857,6 +866,8 @@ def _to_edge_and_lower_llama( # noqa: C901
mps: bool = False,
coreml: bool = False,
qnn: bool = False,
+ openvino: bool = False,
+ openvino_device: str = "CPU",
dtype_override: str = "fp32",
enable_dynamic_shape: bool = True,
use_kv_cache: bool = False,
@@ -901,6 +912,10 @@ def _to_edge_and_lower_llama( # noqa: C901
partitioners.append(coreml_partitioner)
modelname = f"coreml_{modelname}"
+ if openvino:
+ partitioners.append(get_openvino_partitioner(openvino_device))
+ modelname = f"openvino_{modelname}"
+
if qnn:
logging.warning(
"The model definition in current repro is not performant, please refer to the instruction"
@@ -1068,6 +1083,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
+ openvino=llm_config.backend.openvino.enabled,
+ openvino_device=llm_config.backend.openvino.device,
dtype_override=llm_config.model.dtype_override,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index 20604bbf635..3c795dcdf66 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -63,6 +63,19 @@ def get_mps_partitioner(use_kv_cache: bool = False):
compile_specs = [CompileSpec("use_fp16", bytes([True]))]
return MPSPartitioner(compile_specs) # pyre-fixme[16]
+def get_openvino_partitioner(device: str):
+ try:
+ from executorch.exir.backend.backend_details import CompileSpec
+ from executorch.backends.openvino.partitioner import (
+ OpenvinoPartitioner,
+ )
+ except ImportError:
+ raise ImportError(
+ "Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino"
+ )
+
+ compile_specs = [CompileSpec("device", device.encode())]
+ return OpenvinoPartitioner(compile_specs)
def get_coreml_partitioner(
ios: int = 15,
From 24f2d930c62484ba038bd9ee9c7fb9fb73cc3fd5 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Sat, 21 Jun 2025 20:43:05 -0700
Subject: [PATCH 003/266] nncf pattern checker in openvino partitioner
---
backends/openvino/partitioner.py | 62 +++++++++++++++++++++++
examples/models/llama/export_llama_lib.py | 2 +-
2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..4828a96f0dd 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -25,6 +25,11 @@
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase
+class PatternNode:
+ op_types = {}
+
+ def __init__(self):
+ self.op_types = {}
class OpenvinoOperatorsSupport(OperatorSupportBase):
@@ -32,6 +37,7 @@ def __init__(
self,
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
+ enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoOperatorsSupport class.
@@ -43,9 +49,12 @@ def __init__(
op_types_to_skip = set()
if op_names_to_skip is None:
op_names_to_skip = set()
+ if enabled_ops_by_name is None:
+ enabled_ops_by_name = set()
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
+ self._enabled_ops_by_name = enabled_ops_by_name
def is_node_supported(self, _, node: torch.fx.Node) -> bool:
"""
@@ -62,6 +71,10 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
op_type = node.target.__name__
else:
op_type = str(node.target)
+
+ if node.name in self._enabled_ops_by_name:
+ return True
+
supported_ops = OperatorSupport(options)._support_dict
if op_type == "getitem":
return True
@@ -88,6 +101,7 @@ def __init__(
compile_spec: List[CompileSpec],
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
+ enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoPartitioner class.
@@ -99,6 +113,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
+ self._enabled_ops_by_name = enabled_ops_by_name
def ops_to_not_decompose(
self,
@@ -120,6 +135,52 @@ def ops_to_not_decompose(
]
return (ops_not_decompose, None)
+ def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+ if node.op == "call_function":
+ if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
+ pt_input_nodes = node.all_input_nodes
+ pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)]
+ if pattern_input_ops is None:
+ enabled_ops.append(node)
+ return True
+ if len(pt_input_nodes) != len(pattern_input_ops):
+ return False
+ for i in range(len(pt_input_nodes)):
+ if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+ return False
+ enabled_ops.append(node)
+ return True
+ elif node.op == "get_attr":
+ if "get_attr" in pattern.op_types:
+ return True
+ else:
+ return False
+ elif node.op == "placeholder":
+ if "placeholder" in pattern.op_types:
+ return True
+ else:
+ return False
+ return False
+
+ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
+ const_node = PatternNode
+ const_node.op_types["get_attr"] = None
+ const_node.op_types["placeholder"] = None
+ bitwise_right_shift_node = PatternNode
+ bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
+ bitwise_and_node = PatternNode
+ bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node]
+ stack_node = PatternNode
+ stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
+
+ for node in graph_module.graph.nodes:
+ if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default":
+ enabled_ops = []
+ pattern_match = self.check_pattern(node, stack_node, enabled_ops)
+ if pattern_match:
+ for pattern_op in enabled_ops:
+ self._enabled_ops_by_name.add(pattern_op.name)
+
def partition(self, exported_program: ExportedProgram) -> PartitionResult:
"""
Partitions an exported program into supported and unsupported segments.
@@ -127,6 +188,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
:param exported_program: The exported program.
:return: A PartitionResult containing the partitioned graph and delegation tags.
"""
+ self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 8afaa8bf409..a01b05daa17 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -448,7 +448,7 @@ def build_args_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--openvino_device",
type=str,
- default=None,
+ default="CPU",
choices=["CPU", "GPU"],
help="Specify the device for Openvino (CPU or GPU).",
)
From 7dd8d0f17aec743d7796bf7b314df97f2aeb90eb Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 23 Jun 2025 19:11:55 +0400
Subject: [PATCH 004/266] nncf compression init
---
examples/models/llama/export_llama_lib.py | 8 ++++++
extension/llm/export/builder.py | 32 +++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index a01b05daa17..087e4d1efdc 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -551,6 +551,13 @@ def build_args_parser() -> argparse.ArgumentParser:
help="path to the input pruning token mapping file (token_map.json)",
)
+ parser.add_argument(
+ "--nncf_compression",
+ default=False,
+ action="store_true",
+ help="If true, stops right after torch.export() and saves the exported model.",
+ )
+
parser.add_argument(
"--export_only",
default=False,
@@ -1207,6 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
+ nncf_compression=llm_config.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 4128bfd8198..f185d9b346d 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -16,6 +16,7 @@
from typing import Any, Callable, Dict, List, Optional, Tuple
from unittest.mock import patch
+import nncf
import torch
from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
DuplicateDynamicQuantChainPass,
@@ -40,6 +41,7 @@
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
from torchao.utils import unwrap_tensor_subclass
+from functools import partial
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -98,6 +100,7 @@ def __init__(
dynamic_shapes: Optional[Any] = None,
use_legacy_export: bool = False,
save_exported_program: bool = False,
+ nncf_compression: bool = False
):
# Store necessary constructor arguments.
self.model = model
@@ -119,6 +122,7 @@ def __init__(
self.dynamic_shapes = dynamic_shapes
self.use_legacy_export = use_legacy_export
self.save_exported_program = save_exported_program
+ self.nncf_compression = nncf_compression
# Note: treat this as the source of truth for the result of
# torch.export'ing a model. If the overall ExportedProgram is needed,
@@ -428,6 +432,34 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
DuplicateDynamicQuantChainPass()(m)
self.pre_autograd_graph_module = m
return self
+ elif (self.nncf_compression):
+ tokenizer = get_tokenizer(self.tokenizer_path)
+
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
+ logging.error(tokenized_text)
+
+ inputs = ()
+ inputs = (
+ torch.tensor(tokenized_text).unsqueeze(0),
+ {"input_pos": torch.tensor([0])},
+ )
+
+ return inputs
+
+ self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
+ self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data
+
+ self.pre_autograd_graph_module = nncf.compress_weights(
+ self.pre_autograd_graph_module,
+ dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+ return self
else:
logging.info("No quantizer provided, passing...")
return self
From 1716834b5ff3889da366f54e2d6f2a3e3e999117 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Mon, 23 Jun 2025 13:43:11 -0700
Subject: [PATCH 005/266] openvino backend llama nncf support
---
backends/openvino/partitioner.py | 5 +-
backends/openvino/utils.py | 66 ++++++++++++++++++++++
examples/models/llama/config/llm_config.py | 3 +
examples/models/llama/export_llama_lib.py | 4 +-
extension/llm/export/builder.py | 39 +++++++++----
5 files changed, 101 insertions(+), 16 deletions(-)
create mode 100644 backends/openvino/utils.py
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 4828a96f0dd..b1e7f5d436a 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -101,7 +101,6 @@ def __init__(
compile_spec: List[CompileSpec],
op_types_to_skip: Optional[set] = None,
op_names_to_skip: Optional[set] = None,
- enabled_ops_by_name: Optional[set] = None,
) -> None:
"""
Initializes the OpenvinoPartitioner class.
@@ -113,7 +112,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
- self._enabled_ops_by_name = enabled_ops_by_name
+ self._enabled_ops_by_name = set()
def ops_to_not_decompose(
self,
@@ -191,7 +190,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
- OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
+ OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name),
allows_single_node_partition=True,
)
partition_list = partitioner.propose_partitions()
diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py
new file mode 100644
index 00000000000..ec4bebe0d6d
--- /dev/null
+++ b/backends/openvino/utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import executorch.exir as exir
+
+import torch
+from executorch.exir import EdgeProgramManager
+from executorch.exir.program._program import to_edge_with_preserved_ops
+from executorch.exir.tracer import Value
+from torch.export import ExportedProgram
+from executorch.extension.export_util.utils import _to_core_aten
+
+_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
+ _check_ir_validity=True,
+ _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue
+)
+
+def nncf_core_aten_to_edge(
+ core_aten_exir_ep: ExportedProgram,
+ edge_constant_methods: Optional[Dict[str, Any]] = None,
+ edge_compile_config=None,
+ verbose=True,
+) -> EdgeProgramManager:
+ if not edge_compile_config:
+ edge_compile_config = exir.EdgeCompileConfig(
+ _check_ir_validity=False, # quant ops currently break ir verification
+ )
+ edge_manager: EdgeProgramManager = to_edge_with_preserved_ops(
+ core_aten_exir_ep,
+ constant_methods=edge_constant_methods,
+ compile_config=edge_compile_config,
+ preserve_ops=[torch.ops.aten.stack.default,],
+ )
+ if verbose:
+ logging.info(f"Exported graph:\n{edge_manager.exported_program()}")
+ return edge_manager
+
+def nncf_export_to_edge(
+ model: Union[torch.fx.GraphModule, torch.nn.Module],
+ example_inputs: Tuple[Value, ...],
+ *,
+ example_kwarg_inputs: Optional[Dict] = None,
+ dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+ edge_constant_methods: Optional[Dict[str, Any]] = None,
+ edge_compile_config=_EDGE_COMPILE_CONFIG,
+ strict=True,
+ verbose=True,
+) -> EdgeProgramManager:
+ core_aten_ep = _to_core_aten(
+ model,
+ example_inputs,
+ example_kwarg_inputs=example_kwarg_inputs,
+ dynamic_shapes=dynamic_shapes,
+ strict=strict,
+ verbose=verbose,
+ )
+ return nncf_core_aten_to_edge(
+ core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
+ )
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
index 2de58fe47eb..530f7335d8e 100644
--- a/examples/models/llama/config/llm_config.py
+++ b/examples/models/llama/config/llm_config.py
@@ -445,6 +445,7 @@ class OpenvinoConfig:
enabled: bool = False
device: str = "CPU"
+ nncf_compression = False
@dataclass
@@ -625,6 +626,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.backend.openvino.enabled = args.openvino
if hasattr(args, "openvino_device"):
llm_config.backend.openvino.device = args.openvino_device
+ if hasattr(args, "nncf_compression"):
+ llm_config.backend.openvino.nncf_compression = args.nncf_compression
# DebugConfig
if hasattr(args, "profile_memory"):
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 087e4d1efdc..1ea82e3224a 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -555,7 +555,7 @@ def build_args_parser() -> argparse.ArgumentParser:
"--nncf_compression",
default=False,
action="store_true",
- help="If true, stops right after torch.export() and saves the exported model.",
+ help="Enables nncf compression for openvino backend",
)
parser.add_argument(
@@ -1214,7 +1214,7 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
- nncf_compression=llm_config.nncf_compression,
+ nncf_compression=llm_config.backend.openvino.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index f185d9b346d..a2bfaeae22d 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -16,7 +16,6 @@
from typing import Any, Callable, Dict, List, Optional, Tuple
from unittest.mock import patch
-import nncf
import torch
from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
DuplicateDynamicQuantChainPass,
@@ -41,7 +40,6 @@
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
from torchao.utils import unwrap_tensor_subclass
-from functools import partial
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -433,6 +431,13 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
self.pre_autograd_graph_module = m
return self
elif (self.nncf_compression):
+ try:
+ import nncf
+ from functools import partial
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
tokenizer = get_tokenizer(self.tokenizer_path)
def transform_fn(
@@ -487,15 +492,27 @@ def export_to_edge(self) -> "LLMEdgeManager":
)
with override_export_behaviour:
- self.edge_manager = export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
+ if (self.nncf_compression):
+ from executorch.backends.openvino.utils import nncf_export_to_edge
+ self.edge_manager = nncf_export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
+ else:
+ self.edge_manager = export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
return self
def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":
From 198190e6a250632ed9921fa346895521e5b22dfb Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 7 Jul 2025 14:38:05 +0400
Subject: [PATCH 006/266] openvino quantizer init
---
.../quantizer/observers/nncf_observers.py | 114 ++++++++++++
backends/openvino/quantizer/quantizer.py | 170 ++++++++++++------
2 files changed, 228 insertions(+), 56 deletions(-)
create mode 100644 backends/openvino/quantizer/observers/nncf_observers.py
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
new file mode 100644
index 00000000000..54f4348e0ed
--- /dev/null
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -0,0 +1,114 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size
+from torch.ao.quantization.pt2e._affine_quantization import (
+ _get_reduction_params,
+ AffineQuantizedMinMaxObserver,
+)
+from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor
+from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder
+from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
+from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
+
+from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.parameters import CompressWeightsMode
+from nncf.tensor.tensor import Tensor
+
+class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM
+ assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
+ self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)
+
+ def calculate_qparams(self, weight):
+ assert hasattr(self, "min_val") and hasattr(
+ self, "max_val"
+ ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
+ _, reduction_dims = _get_reduction_params(
+ self.block_size, weight.size()
+ )
+ assert len(reduction_dims) == 1, "Only 1-D group size is supported"
+ reduction_dims = reduction_dims[0] - 1
+ q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+ print("calling convert")
+ assert (
+ self.original_dtype is not None
+ ), "Expecting original_dtype to be populated"
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+ with model.graph.inserting_before(observer_node):
+ if(zero_point is not None):
+ decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype)
+ else:
+ decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
+ packed_q_weight = decompressor.pack_weight(q_weight)
+ new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ decompressor_name,
+ )(model)
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node)
+ model.graph.erase_node(observer_node)
+
+
+class NNCFInt8observer(PerChannelMinMaxObserver):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM
+ self.wc_config = WeightCompressionConfig(mode=qmode)
+
+ def calculate_qparams(self, weight):
+ assert hasattr(self, "min_val") and hasattr(
+ self, "max_val"
+ ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
+ self.granularity = PerAxis(axis=self.ch_axis)
+ self.block_size = get_block_size(weight.shape, self.granularity)
+ _, reduction_dims = _get_reduction_params(
+ self.block_size, weight.size()
+ )
+ q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+ print("calling convert")
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+ with model.graph.inserting_before(observer_node):
+ if(zero_point is not None):
+ decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
+ else:
+ decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ packed_q_weight = decompressor.pack_weight(q_weight)
+ new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ decompressor_name,
+ )(model)
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node)
+ model.graph.erase_node(observer_node)
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index d0622b24e6d..f8f08996f53 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -21,6 +21,8 @@
HistogramObserver,
PerChannelMinMaxObserver,
UniformQuantizationObserverBase,
+ PerGroup,
+ MappingType,
)
from torchao.quantization.pt2e.quantizer import (
EdgeOrNode,
@@ -30,6 +32,9 @@
Quantizer,
SharedQuantizationSpec,
)
+from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
+from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer
QUANT_ANNOTATION_KEY = "quantization_annotation"
@@ -46,6 +51,10 @@ class QuantizationMode(Enum):
INT8_SYM = "int8_sym"
INT8_MIXED = "int8_mixed"
INT8_TRANSFORMER = "int8_transformer"
+ INT8_SYM_WC = "int8_sym_wc"
+ INT8_ASYM_WC = "int8_asym_wc"
+ INT4_SYM_WC = "int4_sym"
+ INT4_ASYM_WC = "int4_asym"
class OpenVINOQuantizer(Quantizer):
@@ -66,8 +75,12 @@ def __init__(
- INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
- INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
Default value is INT8_SYM.
+ - INT4_SYM: Symmetric INT4 Weights-Only Compression
+ - INT4_ASYM: Asymmetric INT4 Weights-Only Compression
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
+ self.mode = mode
+ self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
if mode == QuantizationMode.INT8_SYM:
preset = quantization.structs.QuantizationPreset.PERFORMANCE
model_type = None
@@ -77,11 +90,24 @@ def __init__(
else:
preset = None
model_type = nncf.parameters.ModelType.TRANSFORMER
- self._min_max_algo = (
- nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
- preset=preset, model_type=model_type, **kwargs
+ if(self.mode not in self.wc_modes):
+ self._min_max_algo = (
+ nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
+ preset=preset, model_type=model_type, **kwargs
+ )
)
- )
+ self._algo = self._min_max_algo
+ else:
+ weight_compression_configuration = get_weight_compression_configuration(
+ mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode
+ **kwargs
+ )
+ self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+ subset_size=None,
+ **weight_compression_configuration
+ )
+ self._algo = self._weight_compression_algo
+
def set_ignored_scope(
self,
@@ -102,7 +128,7 @@ def set_ignored_scope(
:param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
in the model graph.
"""
- self._min_max_algo.set_ignored_scope(
+ self._algo.set_ignored_scope(
nncf.IgnoredScope(
names=names or [],
patterns=patterns or [],
@@ -115,63 +141,80 @@ def set_ignored_scope(
def get_nncf_quantization_setup(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
- self._min_max_algo._set_backend_entity(model)
- return self._min_max_algo.find_quantization_setup(model, nncf_graph)
+ self._algo._set_backend_entity(model)
+ return self._algo.find_quantization_setup(model, nncf_graph)
def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
- quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
-
+
graph = model.graph
node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
defaultdict(QuantizationAnnotation)
)
+        # Separate annotation handling for quantize vs. compress modes
+ if(self.mode in self.wc_modes):
+ self._algo.set_backend_entity(model)
+ nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+ for node in nodes_to_compress:
+ quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
+ group_size = self._algo._group_size
+ num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8
+ qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
+ nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
+ qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ else:
+ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
- for qp in quantization_setup.quantization_points.values():
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ for qp in quantization_setup.quantization_points.values():
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ for quantizer_ids in quantization_setup.unified_scale_groups.values():
- root_quantizer_id = self._get_unified_scales_root_quantizer_id(
- nncf_graph, quantizer_ids, quantization_setup
- )
- root_qp = quantization_setup.quantization_points[root_quantizer_id]
+ root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+ nncf_graph, quantizer_ids, quantization_setup
+ )
+ root_qp = quantization_setup.quantization_points[root_quantizer_id]
- if any(
- root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
- for q_id in quantizer_ids
- ):
- qps = [
- quantization_setup.quantization_points[q_id]
+ if any(
+ root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
- ]
- msg = (
- "Different quantization configs are set to one unified scale group:"
- f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+ ):
+ qps = [
+ quantization_setup.quantization_points[q_id]
+ for q_id in quantizer_ids
+ ]
+ msg = (
+ "Different quantization configs are set to one unified scale group:"
+ f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+ )
+ raise nncf.InternalError(msg)
+
+ root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, root_qp.insertion_point.target_node_name
+ )
+ root_edge_or_node = self._get_edge_or_node(
+ root_target_node, root_qp, nncf_graph
)
- raise nncf.InternalError(msg)
-
- root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, root_qp.insertion_point.target_node_name
- )
- root_edge_or_node = self._get_edge_or_node(
- root_target_node, root_qp, nncf_graph
- )
- for quantizer_id in quantizer_ids:
- if quantizer_id == root_quantizer_id:
- continue
+ for quantizer_id in quantizer_ids:
+ if quantizer_id == root_quantizer_id:
+ continue
- qspec = SharedQuantizationSpec(root_edge_or_node)
- qp = quantization_setup.quantization_points[quantizer_id]
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ qspec = SharedQuantizationSpec(root_edge_or_node)
+ qp = quantization_setup.quantization_points[quantizer_id]
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for node, annotation in node_vs_torch_annotation.items():
assert QUANT_ANNOTATION_KEY not in node.meta
@@ -295,8 +338,8 @@ def _fill_torch_ao_annotation(
annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
@staticmethod
- def _get_torch_ao_qspec_from_qp(
- qp: quantization.quantizer_setup.QuantizationPointBase,
+ def _get_torch_ao_qspec_from_nncf_config(
+ qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
) -> QuantizationSpec:
"""
Retrieves the quantization configuration from the given quantization point and
@@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp(
"""
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args = {"eps": 1e-16}
- qconfig = qp.qconfig
is_weight = qp.is_weight_quantization_point()
+ qconfig = qp.qconfig
observer: Type[UniformQuantizationObserverBase]
-
if qconfig.per_channel:
torch_qscheme = (
torch.per_channel_symmetric
@@ -325,11 +367,27 @@ def _get_torch_ao_qspec_from_qp(
else torch.per_tensor_affine
)
if is_weight:
- observer = PerChannelMinMaxObserver
- quant_min = -128
- quant_max = 127
- dtype = torch.int8
- channel_axis = 0
+ mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
+ if qconfig.num_bits==4:
+ extra_args["mapping_type"] = mapping_type
+ extra_args["target_dtype"] = torch.int8
+ extra_args["granularity"] = PerGroup(group_size=group_size)
+ observer = PTPerBlockParamObserver
+ quant_min = -8
+ quant_max = 7
+ dtype = torch.int8
+ channel_axis = 0
+ elif qconfig.num_bits==8:
+ observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ quant_min = -128
+ quant_max = 127
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
+ )
else:
observer = (
HistogramObserver
From 3d88a4ea80179ba5b4498a47b3365440c81a37bd Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Tue, 8 Jul 2025 12:45:43 -0700
Subject: [PATCH 007/266] Moved all openvino llama example changes into
export_llama_lib
---
backends/openvino/partitioner.py | 1 +
examples/models/llama/export_llama_lib.py | 85 ++++++++++++++++++++---
extension/llm/export/builder.py | 67 +++---------------
3 files changed, 86 insertions(+), 67 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index b1e7f5d436a..b508a698cab 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -131,6 +131,7 @@ def ops_to_not_decompose(
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.upsample_nearest2d.default,
torch.ops.aten.upsample_nearest2d.vec,
+ torch.ops.aten.stack.default,
]
return (ops_not_decompose, None)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1ea82e3224a..ecf0ea72dca 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -862,6 +862,73 @@ def _to_edge_and_lower_llama_xnnpack(
return builder.to_executorch(passes=additional_passes)
+def _to_edge_and_lower_llama_openvino(
+ builder_exported,
+ modelname,
+ additional_passes,
+ openvino_device: str = "CPU",
+ nncf_compression: bool = False,
+ verbose: bool = False,
+) -> LLMEdgeManager: # noqa: C901
+ partitioners = []
+
+ # Add OpenVINO partitioner
+ partitioners.append(get_openvino_partitioner(openvino_device))
+ modelname = f"openvino_{modelname}"
+
+
+ logging.info("Lowering model using following partitioner(s): ")
+ for partitioner in partitioners:
+ logging.info(f"--> {partitioner.__class__.__name__}")
+
+ # Use NNCF compression if enabled
+ # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
+ if nncf_compression:
+ try:
+ import nncf
+ from functools import partial
+ from pytorch_tokenizers import get_tokenizer
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+ tokenizer = get_tokenizer(builder_exported.tokenizer_path)
+
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
+ logging.error(tokenized_text)
+
+ inputs = ()
+ inputs = (
+ torch.tensor(tokenized_text).unsqueeze(0),
+ {"input_pos": torch.tensor([0])},
+ )
+
+ return inputs
+
+ builder_exported.calibration_data = [builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data
+ builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data
+
+ builder_exported.pre_autograd_graph_module = nncf.compress_weights(
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(
+ partitioners
+ )
+
+ if verbose:
+ print_delegation_info(builder.edge_manager.exported_program().graph_module)
+
+ return builder.to_executorch(passes=additional_passes)
+
+
def _to_edge_and_lower_llama( # noqa: C901
builder_exported,
modelname,
@@ -873,8 +940,6 @@ def _to_edge_and_lower_llama( # noqa: C901
mps: bool = False,
coreml: bool = False,
qnn: bool = False,
- openvino: bool = False,
- openvino_device: str = "CPU",
dtype_override: str = "fp32",
enable_dynamic_shape: bool = True,
use_kv_cache: bool = False,
@@ -919,10 +984,6 @@ def _to_edge_and_lower_llama( # noqa: C901
partitioners.append(coreml_partitioner)
modelname = f"coreml_{modelname}"
- if openvino:
- partitioners.append(get_openvino_partitioner(openvino_device))
- modelname = f"openvino_{modelname}"
-
if qnn:
logging.warning(
"The model definition in current repro is not performant, please refer to the instruction"
@@ -1078,6 +1139,15 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
generate_etrecord=llm_config.debug.generate_etrecord,
verbose=llm_config.debug.verbose,
)
+ elif llm_config.backend.openvino.enabled:
+ builder = _to_edge_and_lower_llama_openvino(
+ builder_exported,
+ modelname,
+ additional_passes,
+ openvino_device=llm_config.backend.openvino.device,
+ nncf_compression=llm_config.backend.openvino.nncf_compression,
+ verbose=llm_config.debug.verbose,
+ )
else:
builder = _to_edge_and_lower_llama(
builder_exported,
@@ -1090,8 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
mps=llm_config.backend.mps.enabled,
coreml=llm_config.backend.coreml.enabled,
qnn=llm_config.backend.qnn.enabled,
- openvino=llm_config.backend.openvino.enabled,
- openvino_device=llm_config.backend.openvino.device,
dtype_override=llm_config.model.dtype_override,
enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
use_kv_cache=llm_config.model.use_kv_cache,
@@ -1214,7 +1282,6 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
use_legacy_export=llm_config.backend.qnn.enabled,
save_exported_program=llm_config.export.export_only,
verbose=llm_config.debug.verbose,
- nncf_compression=llm_config.backend.openvino.nncf_compression,
metadata=_load_llama_model_metadata(
WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
llm_config.model.use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index a2bfaeae22d..4128bfd8198 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -98,7 +98,6 @@ def __init__(
dynamic_shapes: Optional[Any] = None,
use_legacy_export: bool = False,
save_exported_program: bool = False,
- nncf_compression: bool = False
):
# Store necessary constructor arguments.
self.model = model
@@ -120,7 +119,6 @@ def __init__(
self.dynamic_shapes = dynamic_shapes
self.use_legacy_export = use_legacy_export
self.save_exported_program = save_exported_program
- self.nncf_compression = nncf_compression
# Note: treat this as the source of truth for the result of
# torch.export'ing a model. If the overall ExportedProgram is needed,
@@ -430,41 +428,6 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
DuplicateDynamicQuantChainPass()(m)
self.pre_autograd_graph_module = m
return self
- elif (self.nncf_compression):
- try:
- import nncf
- from functools import partial
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
- tokenizer = get_tokenizer(self.tokenizer_path)
-
- def transform_fn(
- prompts: str, tokenizer
- ):
- tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
- logging.error(tokenized_text)
-
- inputs = ()
- inputs = (
- torch.tensor(tokenized_text).unsqueeze(0),
- {"input_pos": torch.tensor([0])},
- )
-
- return inputs
-
- self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
- self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data
-
- self.pre_autograd_graph_module = nncf.compress_weights(
- self.pre_autograd_graph_module,
- dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
- return self
else:
logging.info("No quantizer provided, passing...")
return self
@@ -492,27 +455,15 @@ def export_to_edge(self) -> "LLMEdgeManager":
)
with override_export_behaviour:
- if (self.nncf_compression):
- from executorch.backends.openvino.utils import nncf_export_to_edge
- self.edge_manager = nncf_export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
- else:
- self.edge_manager = export_to_edge(
- self.pre_autograd_graph_module, # pyre-fixme[6]
- self.example_inputs,
- example_kwarg_inputs=self.example_kwarg_inputs,
- dynamic_shapes=dynamic_shape,
- edge_constant_methods=self.metadata,
- edge_compile_config=edge_config,
- verbose=self.verbose,
- )
+ self.edge_manager = export_to_edge(
+ self.pre_autograd_graph_module, # pyre-fixme[6]
+ self.example_inputs,
+ example_kwarg_inputs=self.example_kwarg_inputs,
+ dynamic_shapes=dynamic_shape,
+ edge_constant_methods=self.metadata,
+ edge_compile_config=edge_config,
+ verbose=self.verbose,
+ )
return self
def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":
From e81f60d895fe235e00fa11567f5f85e6d6e25d08 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Tue, 8 Jul 2025 12:57:22 -0700
Subject: [PATCH 008/266] Removed openvino utils.py since it is not needed
anymore
---
backends/openvino/utils.py | 66 --------------------------------------
1 file changed, 66 deletions(-)
delete mode 100644 backends/openvino/utils.py
diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py
deleted file mode 100644
index ec4bebe0d6d..00000000000
--- a/backends/openvino/utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Intel Corporation
-#
-# Licensed under the BSD License (the "License"); you may not use this file
-# except in compliance with the License. See the license file found in the
-# LICENSE file in the root directory of this source tree.
-
-import logging
-
-from typing import Any, Dict, Optional, Tuple, Union
-
-import executorch.exir as exir
-
-import torch
-from executorch.exir import EdgeProgramManager
-from executorch.exir.program._program import to_edge_with_preserved_ops
-from executorch.exir.tracer import Value
-from torch.export import ExportedProgram
-from executorch.extension.export_util.utils import _to_core_aten
-
-_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
- _check_ir_validity=True,
- _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue
-)
-
-def nncf_core_aten_to_edge(
- core_aten_exir_ep: ExportedProgram,
- edge_constant_methods: Optional[Dict[str, Any]] = None,
- edge_compile_config=None,
- verbose=True,
-) -> EdgeProgramManager:
- if not edge_compile_config:
- edge_compile_config = exir.EdgeCompileConfig(
- _check_ir_validity=False, # quant ops currently break ir verification
- )
- edge_manager: EdgeProgramManager = to_edge_with_preserved_ops(
- core_aten_exir_ep,
- constant_methods=edge_constant_methods,
- compile_config=edge_compile_config,
- preserve_ops=[torch.ops.aten.stack.default,],
- )
- if verbose:
- logging.info(f"Exported graph:\n{edge_manager.exported_program()}")
- return edge_manager
-
-def nncf_export_to_edge(
- model: Union[torch.fx.GraphModule, torch.nn.Module],
- example_inputs: Tuple[Value, ...],
- *,
- example_kwarg_inputs: Optional[Dict] = None,
- dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
- edge_constant_methods: Optional[Dict[str, Any]] = None,
- edge_compile_config=_EDGE_COMPILE_CONFIG,
- strict=True,
- verbose=True,
-) -> EdgeProgramManager:
- core_aten_ep = _to_core_aten(
- model,
- example_inputs,
- example_kwarg_inputs=example_kwarg_inputs,
- dynamic_shapes=dynamic_shapes,
- strict=strict,
- verbose=verbose,
- )
- return nncf_core_aten_to_edge(
- core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
- )
From 457a868cb01bc1a4be090da18b3e431cf3b506d0 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 9 Jul 2025 11:53:26 +0400
Subject: [PATCH 009/266] Update nncf_observers.py
---
.../quantizer/observers/nncf_observers.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index 54f4348e0ed..977458801a4 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -57,12 +57,14 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
else:
decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
packed_q_weight = decompressor.pack_weight(q_weight)
- new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
@@ -101,14 +103,16 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
else:
decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
packed_q_weight = decompressor.pack_weight(q_weight)
- new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
+ [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
\ No newline at end of file
+ model.graph.erase_node(observer_node)
From d1e9330b53f96068590b767ec8896a9317a1e954 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Mon, 14 Jul 2025 18:55:40 -0700
Subject: [PATCH 010/266] Add export llama runner build option into openvino
build script
---
backends/openvino/scripts/openvino_build.sh | 28 +++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index c10a3bb4eeb..add946e15ae 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -17,7 +17,7 @@ main() {
# Set build directory
local build_dir="cmake-out"
- # Create and enter the build directory
+ # Enter the Executorch root directory
cd "$EXECUTORCH_ROOT"
rm -rf "${build_dir}"
@@ -32,6 +32,7 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-B"${build_dir}"
@@ -42,7 +43,7 @@ main() {
elif [[ "$build_type" == "--enable_python" ]]; then
echo "Building Python Package with Pybinding"
- # Create and enter the build directory
+ # Enter the Executorch root directory
cd "$EXECUTORCH_ROOT"
./install_executorch.sh --clean
@@ -58,6 +59,29 @@ main() {
# Install torchao
pip install third-party/ao
+    # If the first argument is --llama_runner, build the export llama runner binary
+ # Note: c++ runtime with openvino backend should be built before building export llama runner
+ elif [[ "$build_type" == "--llama_runner" ]]; then
+ echo "Building Export Llama Runner"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DBUILD_TESTING=OFF \
+ -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -B"${build_dir}"/examples/models/llama \
+ examples/models/llama
+
+ # Build the export llama runner
+ cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
else
echo "Error: Argument is not valid: $build_type"
exit 1 # Exit the script with an error code
From cedab9d875e2965f4faaa90e16a1be1adc8d507d Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Mon, 14 Jul 2025 19:10:02 -0700
Subject: [PATCH 011/266] Update README.md
---
examples/openvino/README.md | 48 +++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/examples/openvino/README.md b/examples/openvino/README.md
index 8856ccdce4e..dbce5df1b55 100644
--- a/examples/openvino/README.md
+++ b/examples/openvino/README.md
@@ -183,3 +183,51 @@ Run inference with a given model for 10 iterations:
--model_path=model.pte \
--num_executions=10
```
+
+# Export Llama with OpenVINO Backend
+
+## Download the Model
+Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+
+## Environment Setup
+Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+
+## Export the model:
+Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+
+```
+LLAMA_CHECKPOINT=<path_to_model_dir>/consolidated.00.pth
+LLAMA_PARAMS=<path_to_model_dir>/params.json
+LLAMA_TOKENIZER=<path_to_model_dir>/tokenizer.model
+
+python -u -m examples.models.llama.export_llama \
+ --model "llama3_2" \
+ --checkpoint "${LLAMA_CHECKPOINT:?}" \
+ --params "${LLAMA_PARAMS:?}" \
+ -kv \
+ --openvino \
+ -d fp32 \
+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ --output_name="llama.pte" \
+ --verbose \
+ --disable_dynamic_shape \
+ --tokenizer_path "${LLAMA_TOKENIZER:?}" \
+ --nncf_compression
+```
+
+## Build OpenVINO C++ Runtime with Llama Runner:
+First, build the backend libraries by executing the script below in the `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh
+```
+Then, build the llama runner by executing the script below (with the `--llama_runner` argument), also in the `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --llama_runner
+```
+The executable is saved in `<executorch_root>/cmake-out/examples/models/llama/llama_main`
+
+## Execute Inference Using Llama Runner
+Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
+```
+./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=<path_to_model_dir>/tokenizer.model --prompt="Your custom prompt"
+```
From e54f4c7ef6207733f0907cbe1030124926f6550c Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 19 Aug 2025 15:56:35 -0700
Subject: [PATCH 012/266] Added CMAKE EXPORT Changes
---
backends/openvino/CMakeLists.txt | 12 +++++++++---
backends/openvino/scripts/openvino_build.sh | 8 +++-----
examples/models/llama/CMakeLists.txt | 3 +--
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index cb240805665..a2b982babab 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -38,7 +38,11 @@ add_library(openvino_backend STATIC .)
target_compile_options(openvino_backend PRIVATE -frtti -fexceptions)
# Include Executorch directories
-target_include_directories(openvino_backend PUBLIC ${COMMON_INCLUDE_DIRS})
+target_include_directories(openvino_backend
+ PUBLIC
+  $<BUILD_INTERFACE:${COMMON_INCLUDE_DIRS}>
+)
+
# Link OpenVINO and ExecuteTorch core libraries
target_link_libraries(
@@ -77,5 +81,7 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
)
endif()
-# Install OpenVINO backend library to the lib directory
-install(TARGETS openvino_backend DESTINATION lib)
+# Install OpenVINO backend library and export target
+install(TARGETS openvino_backend
+ EXPORT ExecuTorchTargets
+ DESTINATION lib)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index 7f903086163..08741840ddb 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -33,6 +33,8 @@ main() {
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-B"${build_dir}"
@@ -72,14 +74,10 @@ main() {
# Configure the project with CMake
# Note: Add any additional configuration options you need here
- cmake -DBUILD_TESTING=OFF \
- -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
-DCMAKE_BUILD_TYPE=Release \
- -DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-B"${build_dir}"/examples/models/llama \
examples/models/llama
-
# Build the export llama runner
cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
else
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index c469a69596c..a2a1f4efa05 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -192,9 +192,8 @@ endif()
# Openvino backend
if(TARGET openvino_backend)
find_package(OpenVINO REQUIRED)
- target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core)
list(APPEND link_libraries openvino_backend)
- target_link_options_shared_lib(openvino_backend)
+ executorch_target_link_options_shared_lib(openvino_backend)
endif()
if(TARGET coremldelegate)
From c12a4bafd441be0a77f909c063fcb883a8ac900b Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 20 Aug 2025 18:07:33 -0700
Subject: [PATCH 013/266] code formatting updates
---
backends/openvino/CMakeLists.txt | 14 +-
backends/openvino/partitioner.py | 38 +++--
.../quantizer/observers/nncf_observers.py | 133 +++++++++++++-----
backends/openvino/quantizer/quantizer.py | 95 +++++++++----
backends/openvino/runtime/OpenvinoBackend.cpp | 26 ++--
examples/models/llama/export_llama_lib.py | 45 +++---
extension/llm/export/partitioner_lib.py | 6 +-
7 files changed, 243 insertions(+), 114 deletions(-)
diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index a2b982babab..94f47c5e929 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -38,12 +38,10 @@ add_library(openvino_backend STATIC .)
target_compile_options(openvino_backend PRIVATE -frtti -fexceptions)
# Include Executorch directories
-target_include_directories(openvino_backend
- PUBLIC
- $
+target_include_directories(
+ openvino_backend PUBLIC $
)
-
# Link OpenVINO and ExecuteTorch core libraries
target_link_libraries(
openvino_backend PRIVATE openvino::runtime executorch_core
@@ -82,6 +80,8 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
endif()
# Install OpenVINO backend library and export target
-install(TARGETS openvino_backend
- EXPORT ExecuTorchTargets
- DESTINATION lib)
+install(
+ TARGETS openvino_backend
+ EXPORT ExecuTorchTargets
+ DESTINATION lib
+)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index b508a698cab..a2920285f99 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -25,12 +25,14 @@
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase
+
class PatternNode:
op_types = {}
def __init__(self):
self.op_types = {}
+
class OpenvinoOperatorsSupport(OperatorSupportBase):
def __init__(
@@ -135,18 +137,24 @@ def ops_to_not_decompose(
]
return (ops_not_decompose, None)
- def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+ def check_pattern(
+ self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
+ ) -> bool:
if node.op == "call_function":
if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
pt_input_nodes = node.all_input_nodes
- pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)]
+ pattern_input_ops = pattern.op_types[
+ "call_function" + ":" + str(node.target.__name__)
+ ]
if pattern_input_ops is None:
enabled_ops.append(node)
return True
if len(pt_input_nodes) != len(pattern_input_ops):
return False
for i in range(len(pt_input_nodes)):
- if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+ if not self.check_pattern(
+ pt_input_nodes[i], pattern_input_ops[i], enabled_ops
+ ):
return False
enabled_ops.append(node)
return True
@@ -167,14 +175,24 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
const_node.op_types["get_attr"] = None
const_node.op_types["placeholder"] = None
bitwise_right_shift_node = PatternNode
- bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
+ bitwise_right_shift_node.op_types[
+ "call_function:aten.bitwise_right_shift.Tensor_Scalar"
+ ] = [const_node]
bitwise_and_node = PatternNode
- bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node]
+ bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [
+ const_node
+ ]
stack_node = PatternNode
- stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
+ stack_node.op_types["call_function:aten.stack.default"] = [
+ bitwise_and_node,
+ bitwise_right_shift_node,
+ ]
for node in graph_module.graph.nodes:
- if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default":
+ if (
+ str(node.op) == "call_function"
+ and str(node.target.__name__) == "aten.stack.default"
+ ):
enabled_ops = []
pattern_match = self.check_pattern(node, stack_node, enabled_ops)
if pattern_match:
@@ -191,7 +209,11 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
self.capture_nncf_patterns(exported_program.graph_module)
partitioner = CapabilityBasedPartitioner(
exported_program.graph_module,
- OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip, self._enabled_ops_by_name),
+ OpenvinoOperatorsSupport(
+ self._op_types_to_skip,
+ self._op_names_to_skip,
+ self._enabled_ops_by_name,
+ ),
allows_single_node_partition=True,
)
partition_list = partitioner.propose_partitions()
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index 977458801a4..aa531336d0c 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -7,38 +7,65 @@
from typing import Tuple
import torch
-from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size
+from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
+from nncf.experimental.torch.fx.transformations import (
+ constant_update_fn,
+ module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.algorithms.weight_compression.config import (
+ WeightCompressionConfig,
+)
+
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+ do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor
+from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
+from nncf.torch.quantization.layers import (
+ INT4AsymmetricWeightsDecompressor,
+ INT4SymmetricWeightsDecompressor,
+ INT8AsymmetricWeightsDecompressor,
+ INT8SymmetricWeightsDecompressor,
+)
+from torch.ao.quantization.observer import (
+ get_block_size,
+ MappingType,
+ PerAxis,
+ PerChannelMinMaxObserver,
+ PerGroup,
+)
from torch.ao.quantization.pt2e._affine_quantization import (
_get_reduction_params,
AffineQuantizedMinMaxObserver,
)
-from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor
-from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder
-from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
-from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
-from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
-from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.parameters import CompressWeightsMode
-from nncf.tensor.tensor import Tensor
class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- qmode = CompressWeightsMode.INT4_ASYM if self.mapping_type==MappingType.ASYMMETRIC else CompressWeightsMode.INT4_SYM
- assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
- self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)
+ qmode = (
+ CompressWeightsMode.INT4_ASYM
+ if self.mapping_type == MappingType.ASYMMETRIC
+ else CompressWeightsMode.INT4_SYM
+ )
+ assert isinstance(
+ self.granularity, PerGroup
+ ), "Only PerGroup granularity is supported"
+ self.wc_config = WeightCompressionConfig(
+ mode=qmode, group_size=self.granularity.group_size
+ )
def calculate_qparams(self, weight):
assert hasattr(self, "min_val") and hasattr(
self, "max_val"
), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- _, reduction_dims = _get_reduction_params(
- self.block_size, weight.size()
- )
+ _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
assert len(reduction_dims) == 1, "Only 1-D group size is supported"
reduction_dims = reduction_dims[0] - 1
- q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -50,23 +77,38 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
+
with model.graph.inserting_before(observer_node):
- if(zero_point is not None):
- decompressor = INT4AsymmetricWeightsDecompressor(scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype)
+ if zero_point is not None:
+ decompressor = INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
else:
- decompressor = INT4SymmetricWeightsDecompressor(scale, q_weight.shape, original_weight.shape, original_weight.dtype)
+ decompressor = INT4SymmetricWeightsDecompressor(
+ scale, q_weight.shape, original_weight.shape, original_weight.dtype
+ )
packed_q_weight = decompressor.pack_weight(q_weight)
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
- decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
- decompressor_name,
- )(model)
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
model.graph.erase_node(observer_node)
@@ -75,7 +117,11 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
class NNCFInt8observer(PerChannelMinMaxObserver):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- qmode = CompressWeightsMode.INT8_SYM if self.qscheme==torch.per_channel_symmetric else CompressWeightsMode.INT8_ASYM
+ qmode = (
+ CompressWeightsMode.INT8_SYM
+ if self.qscheme == torch.per_channel_symmetric
+ else CompressWeightsMode.INT8_ASYM
+ )
self.wc_config = WeightCompressionConfig(mode=qmode)
def calculate_qparams(self, weight):
@@ -84,10 +130,10 @@ def calculate_qparams(self, weight):
), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
self.granularity = PerAxis(axis=self.ch_axis)
self.block_size = get_block_size(weight.shape, self.granularity)
- _, reduction_dims = _get_reduction_params(
- self.block_size, weight.size()
- )
- q_weight, scale, zp = do_integer_quantization(Tensor(weight), self.wc_config, reduction_axes=reduction_dims)
+ _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -98,21 +144,32 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
q_weight, scale, zero_point = self.calculate_qparams(original_weight)
with model.graph.inserting_before(observer_node):
- if(zero_point is not None):
- decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
+ if zero_point is not None:
+ decompressor = INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
else:
- decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ decompressor = INT8SymmetricWeightsDecompressor(
+ scale, original_weight.dtype
+ )
packed_q_weight = decompressor.pack_weight(q_weight)
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(compressed_weight_name.replace(".", "_").split("_")[:-2])
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
module_insertion_transformation_builder(
- decompressor,
- [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=compressed_weight_name)],
- decompressor_name,
- )(model)
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
decomp_node = observer_node.args[0]
observer_node.replace_all_uses_with(decomp_node)
model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index bf7fd0859d5..820d5dd49ba 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,14 +15,20 @@
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
+from executorch.backends.openvino.quantizer.observers.nncf_observers import (
+ NNCFInt8observer,
+ PTPerBlockParamObserver,
+)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
+from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig
+from nncf.quantization.quantize_model import get_weight_compression_configuration
from torchao.quantization.pt2e import (
HistogramObserver,
+ MappingType,
PerChannelMinMaxObserver,
- UniformQuantizationObserverBase,
PerGroup,
- MappingType,
+ UniformQuantizationObserverBase,
)
from torchao.quantization.pt2e.quantizer import (
EdgeOrNode,
@@ -32,9 +38,6 @@
Quantizer,
SharedQuantizationSpec,
)
-from nncf.quantization.quantize_model import get_weight_compression_configuration
-from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
-from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer
QUANT_ANNOTATION_KEY = "quantization_annotation"
from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
@@ -81,7 +84,12 @@ def __init__(
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
self.mode = mode
- self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
+ self.wc_modes = [
+ QuantizationMode.INT4_ASYM_WC,
+ QuantizationMode.INT4_SYM_WC,
+ QuantizationMode.INT8_ASYM_WC,
+ QuantizationMode.INT8_SYM_WC,
+ ]
if mode == QuantizationMode.INT8_SYM:
preset = quantization.structs.QuantizationPreset.PERFORMANCE
model_type = None
@@ -91,7 +99,7 @@ def __init__(
else:
preset = None
model_type = nncf.parameters.ModelType.TRANSFORMER
- if(self.mode not in self.wc_modes):
+ if self.mode not in self.wc_modes:
self._min_max_algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
@@ -100,16 +108,16 @@ def __init__(
self._algo = self._min_max_algo
else:
weight_compression_configuration = get_weight_compression_configuration(
- mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode
- **kwargs
+ mode.value.replace(
+ "_wc", ""
+ ), # Mode value has to match NNCF CompressWeightsMode
+ **kwargs,
)
self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
- subset_size=None,
- **weight_compression_configuration
+ subset_size=None, **weight_compression_configuration
)
self._algo = self._weight_compression_algo
-
def set_ignored_scope(
self,
names: Optional[List[str]] = None,
@@ -153,20 +161,40 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
defaultdict(QuantizationAnnotation)
)
# Serperate into annotation for quantize and compress
- if(self.mode in self.wc_modes):
+ if self.mode in self.wc_modes:
self._algo.set_backend_entity(model)
nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
for node in nodes_to_compress:
- quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
+ quantization_insertion_point = (
+ quantization.quantizer_setup.WeightQuantizationInsertionPoint(
+ target_node_name=node.node_name
+ )
+ )
group_size = self._algo._group_size
- num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8
- qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
+ num_bits = (
+ 4
+ if self.mode
+ in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
+ else 8
+ )
+ qmode = (
+ QuantizationScheme.SYMMETRIC
+ if self.mode
+ in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
+ else QuantizationScheme.ASYMMETRIC
+ )
nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
- qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
+ qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
+ qip=quantization_insertion_point,
+ qconfig=nncf_qconfig,
+ directly_quantized_operator_node_names=[node],
+ )
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qp, group_size=group_size, weights_only=True
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
else:
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -175,7 +203,9 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qp
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -186,7 +216,8 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
root_qp = quantization_setup.quantization_points[root_quantizer_id]
if any(
- root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+ root_qp.qconfig
+ != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
):
qps = [
@@ -340,7 +371,9 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
- qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
+ qp: quantization.quantizer_setup.QuantizationPointBase,
+ group_size=-1,
+ weights_only=False,
) -> QuantizationSpec:
"""
Retrieves the quantization configuration from the given quantization point and
@@ -368,8 +401,12 @@ def _get_torch_ao_qspec_from_nncf_config(
else torch.per_tensor_affine
)
if is_weight:
- mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
- if qconfig.num_bits==4:
+ mapping_type = (
+ MappingType.SYMMETRIC
+ if qconfig.mode == QuantizationScheme.SYMMETRIC
+ else MappingType.ASYMMETRIC
+ )
+ if qconfig.num_bits == 4:
extra_args["mapping_type"] = mapping_type
extra_args["target_dtype"] = torch.int8
extra_args["granularity"] = PerGroup(group_size=group_size)
@@ -378,16 +415,18 @@ def _get_torch_ao_qspec_from_nncf_config(
quant_max = 7
dtype = torch.int8
channel_axis = 0
- elif qconfig.num_bits==8:
- observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ elif qconfig.num_bits == 8:
+ observer = (
+ NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+ )
quant_min = -128
quant_max = 127
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
- torch.per_channel_symmetric
- if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
- else torch.per_channel_affine
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
)
else:
observer = (
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index 546f4d68573..bac006ce916 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -116,23 +116,23 @@ exr::Error OpenvinoBackend::execute(
infer_request->set_input_tensor(i, ov_input_tensor);
if (args[i]->isInt()) {
- int64_t *val = &(args[i]->payload.copyable_union.as_int);
+ int64_t* val = &(args[i]->payload.copyable_union.as_int);
- // Create OpenVINO tensor from integer input
- ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
- infer_request->set_input_tensor(i, ov_input_tensor);
+ // Create OpenVINO tensor from integer input
+ ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+ infer_request->set_input_tensor(i, ov_input_tensor);
} else {
- auto input_tensor = args[i]->toTensor();
- ov::Shape input_shape(
- input_tensor.sizes().begin(), input_tensor.sizes().end());
+ auto input_tensor = args[i]->toTensor();
+ ov::Shape input_shape(
+ input_tensor.sizes().begin(), input_tensor.sizes().end());
- // Convert input tensor to OpenVINO tensor
- ov::element::Type ov_type =
- convert_to_openvino_type(input_tensor.scalar_type());
- ov::Tensor ov_input_tensor(
- ov_type, input_shape, input_tensor.mutable_data_ptr());
+ // Convert input tensor to OpenVINO tensor
+ ov::element::Type ov_type =
+ convert_to_openvino_type(input_tensor.scalar_type());
+ ov::Tensor ov_input_tensor(
+ ov_type, input_shape, input_tensor.mutable_data_ptr());
- infer_request->set_input_tensor(i, ov_input_tensor);
+ infer_request->set_input_tensor(i, ov_input_tensor);
}
}
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 7b74ee21f77..47527a326f9 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -906,7 +906,6 @@ def _to_edge_and_lower_llama_openvino(
partitioners.append(get_openvino_partitioner(openvino_device))
modelname = f"openvino_{modelname}"
-
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
@@ -915,8 +914,9 @@ def _to_edge_and_lower_llama_openvino(
# TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
if nncf_compression:
try:
- import nncf
from functools import partial
+
+ import nncf
from pytorch_tokenizers import get_tokenizer
except ImportError:
raise ImportError(
@@ -924,9 +924,7 @@ def _to_edge_and_lower_llama_openvino(
)
tokenizer = get_tokenizer(builder_exported.tokenizer_path)
- def transform_fn(
- prompts: str, tokenizer
- ):
+ def transform_fn(prompts: str, tokenizer):
tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
logging.error(tokenized_text)
@@ -938,20 +936,33 @@ def transform_fn(
return inputs
- builder_exported.calibration_data = [builder_exported.calibration_data] if isinstance(builder_exported.calibration_data, str) else builder_exported.calibration_data
- builder_exported.calibration_data = [word for prompt in builder_exported.calibration_data for word in prompt.split()] if not builder_exported.dynamic_shapes else builder_exported.calibration_data
+ builder_exported.calibration_data = (
+ [builder_exported.calibration_data]
+ if isinstance(builder_exported.calibration_data, str)
+ else builder_exported.calibration_data
+ )
+ builder_exported.calibration_data = (
+ [
+ word
+ for prompt in builder_exported.calibration_data
+ for word in prompt.split()
+ ]
+ if not builder_exported.dynamic_shapes
+ else builder_exported.calibration_data
+ )
builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(builder_exported.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
-
- builder = builder_exported.to_edge_transform_and_lower(
- partitioners
- )
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(
+ builder_exported.calibration_data,
+ transform_func=partial(transform_fn, tokenizer=tokenizer),
+ ),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index b34f0a85344..185bc011a32 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -63,12 +63,11 @@ def get_mps_partitioner(use_kv_cache: bool = False):
compile_specs = [CompileSpec("use_fp16", bytes([True]))]
return MPSPartitioner(compile_specs) # pyre-fixme[16]
+
def get_openvino_partitioner(device: str):
try:
+ from executorch.backends.openvino.partitioner import OpenvinoPartitioner
from executorch.exir.backend.backend_details import CompileSpec
- from executorch.backends.openvino.partitioner import (
- OpenvinoPartitioner,
- )
except ImportError:
raise ImportError(
"Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino"
@@ -77,6 +76,7 @@ def get_openvino_partitioner(device: str):
compile_specs = [CompileSpec("device", device.encode())]
return OpenvinoPartitioner(compile_specs)
+
def get_coreml_partitioner(
ios: int = 15,
embedding_quantize: Optional[str] = None,
From bf659439771f5a52ec40a00070ef5ac5c6237cfa Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 20 Aug 2025 18:54:57 -0700
Subject: [PATCH 014/266] code formatting changes
---
.../quantizer/observers/nncf_observers.py | 31 ++++++++++---------
backends/openvino/quantizer/quantizer.py | 9 ++++--
2 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index aa531336d0c..f6ac2a3cb91 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -4,41 +4,42 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-from typing import Tuple
-
import torch
-from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
-from nncf.experimental.torch.fx.transformations import (
+from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
+ get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
constant_update_fn,
module_insertion_transformation_builder,
)
-from nncf.parameters import CompressWeightsMode
-from nncf.quantization.algorithms.weight_compression.config import (
+from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionConfig,
)
-from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
)
-from nncf.tensor.tensor import Tensor
-from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
-from nncf.torch.quantization.layers import (
+from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
+ PTTargetPoint,
+ TargetType,
+)
+from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
INT4AsymmetricWeightsDecompressor,
INT4SymmetricWeightsDecompressor,
INT8AsymmetricWeightsDecompressor,
INT8SymmetricWeightsDecompressor,
)
-from torch.ao.quantization.observer import (
+from torchao.quantization.observer import AffineQuantizedMinMaxObserver
+from torchao.quantization.pt2e import (
get_block_size,
MappingType,
PerAxis,
PerChannelMinMaxObserver,
PerGroup,
)
-from torch.ao.quantization.pt2e._affine_quantization import (
- _get_reduction_params,
- AffineQuantizedMinMaxObserver,
-)
+from torchao.quantization.quant_primitives import _get_reduction_params
class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 820d5dd49ba..cd78f6907c7 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -21,8 +21,13 @@
)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.common.quantization.structs import QuantizationScheme, QuantizerConfig
-from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import ( # type: ignore[import-untyped]
+ QuantizationScheme,
+ QuantizerConfig,
+)
+from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
+ get_weight_compression_configuration,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
MappingType,
From 30a1a258b22d1471c0aae328f30a5910af6af118 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 26 Aug 2025 12:31:49 +0400
Subject: [PATCH 015/266] openvino quantizer refactored
---
backends/openvino/quantizer/__init__.py | 4 +-
backends/openvino/quantizer/observers.py | 286 ++++++++++++
.../quantizer/observers/nncf_observers.py | 176 --------
backends/openvino/quantizer/quantizer.py | 412 ++++++++++--------
examples/models/llama/export_llama_lib.py | 9 +
extension/llm/export/quantizer_lib.py | 38 +-
6 files changed, 573 insertions(+), 352 deletions(-)
create mode 100644 backends/openvino/quantizer/observers.py
delete mode 100644 backends/openvino/quantizer/observers/nncf_observers.py
diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index df038483f2f..0fd8c10b249 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model
+from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
-__all__ = ["OpenVINOQuantizer", "quantize_model"]
+__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
new file mode 100644
index 00000000000..2ea66f11a55
--- /dev/null
+++ b/backends/openvino/quantizer/observers.py
@@ -0,0 +1,286 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: disable-error-code=import-not-found
+
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped]
+
+import torch
+from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped]
+ GraphConverter,
+)
+
+from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
+ get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
+ constant_update_fn,
+ module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
+ WeightCompressionConfig,
+)
+from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped]
+ FXWeightCompressionAlgoBackend,
+)
+from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
+ do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
+ PTTargetPoint,
+ TargetType,
+)
+from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
+ BaseWeightsDecompressor,
+ INT4AsymmetricWeightsDecompressor,
+ INT4SymmetricWeightsDecompressor,
+ INT8AsymmetricWeightsDecompressor,
+ INT8SymmetricWeightsDecompressor,
+)
+from torchao.quantization.pt2e import MappingType, ObserverBase
+from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+
+class WeightObserverBase(ObserverBase, ABC):
+ """
+ Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
+ """
+
+ def calculate_qparams( # type: ignore[override]
+ self,
+ weight: torch.Tensor,
+ observer_node: torch.fx.Node,
+ model: torch.fx.GraphModule,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Calculate quantization parameters such as scale, quantized weight and zero point.
+
+ :param weight: FP weight to be used for calculating qparams.
+ :return: quantization params quantized weight, scale and zero point
+ """
+ ndims = len(weight.size())
+ node_with_weight, weight_port_id = (
+ WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
+ )
+ _, node_metatype = GraphConverter.get_node_type_and_metatype(
+ node_with_weight, model
+ )
+ # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
+ node_metatype = (
+ om.PTAtenEmbeddingMetatype
+ if node_metatype == om.PTEmbeddingMetatype
+ else node_metatype
+ )
+ reduction_dims = get_weight_compression_reduction_axes(
+ node_metatype, weight_port_id, ndims
+ )
+ reduction_dims = tuple(reduction_dims)
+
+ q_weight, scale, zp = do_integer_quantization(
+ Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ )
+ zp = zp.data if zp is not None else None
+ return q_weight.data, scale.data, zp
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x
+
+ @staticmethod
+ def get_node_with_weight_and_port_ids(
+ observer_node: torch.fx.Node, model: torch.fx.GraphModule
+ ) -> Tuple[torch.fx.Node, int]:
+ """
+ Returns the node which contains the weight and the weight port id.
+
+ :param observer_node: Observer node for the weight.
+ :param graph: The model.
+ :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
+ """
+ for node in model.graph.nodes:
+ if observer_node in node.all_input_nodes:
+ return node, node.all_input_nodes.index(observer_node)
+ msg = f"Observer node {observer_node.name} has no consumer node"
+ raise RuntimeError(msg)
+
+ def convert(
+ self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
+ ) -> None:
+ """
+ Converts the weight observer node into a decompression subgraph after calibration.
+ This method is responsible for transforming the model after the quantization preparation
+ and calibration phases. It replaces the observer node with the quantized weight and a decompression
+ module.
+
+ :param model: A `torch.fx.GraphModule` representing the statically traced model
+ with observer nodes attached and calibrated.
+ :param observer_node: The `torch.fx.Node` corresponding to the observer module for
+ the weight that is being transformed into a compressed representation.
+ """
+ weight_node = observer_node.args[0]
+ original_weight = get_tensor_constant_from_node(weight_node, model)
+ q_weight, scale, zero_point = self.calculate_qparams(
+ original_weight, observer_node, model
+ )
+
+ decompressor = self._create_decompressor(
+ scale, zero_point, q_weight, original_weight
+ )
+ packed_q_weight = decompressor.pack_weight(q_weight)
+
+ constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+
+ compressed_weight_name = observer_node.all_input_nodes[0].name
+ decompressor_suffix = "_".join(
+ compressed_weight_name.replace(".", "_").split("_")[:-2]
+ )
+ decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
+
+ module_insertion_transformation_builder(
+ decompressor,
+ [
+ PTTargetPoint(
+ TargetType.OPERATOR_POST_HOOK,
+ target_node_name=compressed_weight_name,
+ )
+ ],
+ decompressor_name,
+ )(model)
+
+ decomp_node = observer_node.args[0]
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
+ model.graph.erase_node(observer_node)
+
+ @abstractmethod
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ """
+ Used to return the respective NNCF decompressor for different types of quantization.
+
+ :param scale: Calculated scale quantization parameter.
+ :param zero_point: Calculated zero_point quantization parameter.
+ :param q_weight: Calculated quantized weight.
+ :param original_weight: FP weight.
+ :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
+ """
+ pass
+
+ @abstractmethod
+ def get_wc_config(self) -> WeightCompressionConfig:
+ """
+ Used to return the respective NNCF Weight Compression Config.
+
+ :return: Weight compression config with the compression information such as qmode, group_size etc.
+ """
+ pass
+
+
+class INT4WeightObserver(WeightObserverBase):
+ """
+ This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+ """
+
+ def __init__(
+ self,
+ group_size: int,
+ mapping_type: MappingType,
+ target_dtype: torch.dtype,
+ *args,
+ **kwargs,
+ ) -> None:
+ """
+ :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
+ :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
+ :param target_dtype: target dtype for quantization such as int8, uint8, etc.
+ """
+ super().__init__(dtype=target_dtype, is_dynamic=False)
+ self.wc_config = None
+ self.mapping_type = mapping_type
+
+ qmode = (
+ CompressWeightsMode.INT4_ASYM
+ if self.mapping_type == MappingType.ASYMMETRIC
+ else CompressWeightsMode.INT4_SYM
+ )
+ self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
+
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ if zero_point is not None:
+ return INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
+ else:
+ return INT4SymmetricWeightsDecompressor(
+ scale, q_weight.shape, original_weight.shape, original_weight.dtype
+ )
+
+ def get_wc_config(self):
+ return self.wc_config
+
+
+class INT8WeightObserver(WeightObserverBase):
+ """
+ This class defines the behavior for Int8 WC which has per channel granularity.
+ """
+
+ def __init__(
+ self,
+ qscheme: torch.qscheme,
+ dtype: torch.dtype,
+ ch_axis: int = 0,
+ *args,
+ **kwargs,
+ ) -> None:
+ """
+ :param qscheme: Quantization scheme which is per-channel for Int8 WC.
+ :param dtype: dtype for quantization such as int8, uint8, etc..
+ :param ch_axis: Channel axis.
+ """
+ super().__init__(dtype=dtype, is_dynamic=False)
+ self.wc_config = None
+ self.qscheme = qscheme
+
+ qmode = (
+ CompressWeightsMode.INT8_SYM
+ if self.qscheme == torch.per_channel_symmetric
+ else CompressWeightsMode.INT8_ASYM
+ )
+ self.wc_config = WeightCompressionConfig(mode=qmode)
+
+ def _create_decompressor(
+ self,
+ scale: torch.Tensor,
+ zero_point: Optional[torch.Tensor],
+ q_weight: torch.Tensor,
+ original_weight: torch.Tensor,
+ ) -> BaseWeightsDecompressor:
+ if zero_point is not None:
+ return INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
+ else:
+ return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+
+ def get_wc_config(self):
+ return self.wc_config
\ No newline at end of file
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
deleted file mode 100644
index f6ac2a3cb91..00000000000
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Qualcomm Innovation Center, Inc.
-# All rights reserved
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import torch
-from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
- get_tensor_constant_from_node,
-)
-from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
- constant_update_fn,
- module_insertion_transformation_builder,
-)
-from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
-from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
- WeightCompressionConfig,
-)
-
-from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
- do_integer_quantization,
-)
-from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
-from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
- PTTargetPoint,
- TargetType,
-)
-from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
- INT4AsymmetricWeightsDecompressor,
- INT4SymmetricWeightsDecompressor,
- INT8AsymmetricWeightsDecompressor,
- INT8SymmetricWeightsDecompressor,
-)
-from torchao.quantization.observer import AffineQuantizedMinMaxObserver
-from torchao.quantization.pt2e import (
- get_block_size,
- MappingType,
- PerAxis,
- PerChannelMinMaxObserver,
- PerGroup,
-)
-from torchao.quantization.quant_primitives import _get_reduction_params
-
-
-class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- qmode = (
- CompressWeightsMode.INT4_ASYM
- if self.mapping_type == MappingType.ASYMMETRIC
- else CompressWeightsMode.INT4_SYM
- )
- assert isinstance(
- self.granularity, PerGroup
- ), "Only PerGroup granularity is supported"
- self.wc_config = WeightCompressionConfig(
- mode=qmode, group_size=self.granularity.group_size
- )
-
- def calculate_qparams(self, weight):
- assert hasattr(self, "min_val") and hasattr(
- self, "max_val"
- ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
- assert len(reduction_dims) == 1, "Only 1-D group size is supported"
- reduction_dims = reduction_dims[0] - 1
- q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
- )
- zp = zp.data if zp is not None else None
- return q_weight.data, scale.data, zp
-
- def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
- print("calling convert")
- assert (
- self.original_dtype is not None
- ), "Expecting original_dtype to be populated"
- weight_node = observer_node.args[0]
- original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
- with model.graph.inserting_before(observer_node):
- if zero_point is not None:
- decompressor = INT4AsymmetricWeightsDecompressor(
- scale,
- zero_point,
- q_weight.shape,
- original_weight.shape,
- original_weight.dtype,
- )
- else:
- decompressor = INT4SymmetricWeightsDecompressor(
- scale, q_weight.shape, original_weight.shape, original_weight.dtype
- )
- packed_q_weight = decompressor.pack_weight(q_weight)
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(
- compressed_weight_name.replace(".", "_").split("_")[:-2]
- )
- decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
- module_insertion_transformation_builder(
- decompressor,
- [
- PTTargetPoint(
- TargetType.OPERATOR_POST_HOOK,
- target_node_name=compressed_weight_name,
- )
- ],
- decompressor_name,
- )(model)
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
-
-
-class NNCFInt8observer(PerChannelMinMaxObserver):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- qmode = (
- CompressWeightsMode.INT8_SYM
- if self.qscheme == torch.per_channel_symmetric
- else CompressWeightsMode.INT8_ASYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode)
-
- def calculate_qparams(self, weight):
- assert hasattr(self, "min_val") and hasattr(
- self, "max_val"
- ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
- self.granularity = PerAxis(axis=self.ch_axis)
- self.block_size = get_block_size(weight.shape, self.granularity)
- _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
- q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
- )
- zp = zp.data if zp is not None else None
- return q_weight.data, scale.data, zp
-
- def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
- print("calling convert")
- weight_node = observer_node.args[0]
- original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
- with model.graph.inserting_before(observer_node):
- if zero_point is not None:
- decompressor = INT8AsymmetricWeightsDecompressor(
- scale, zero_point, original_weight.dtype
- )
- else:
- decompressor = INT8SymmetricWeightsDecompressor(
- scale, original_weight.dtype
- )
- packed_q_weight = decompressor.pack_weight(q_weight)
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
- compressed_weight_name = observer_node.all_input_nodes[0].name
- decompressor_suffix = "_".join(
- compressed_weight_name.replace(".", "_").split("_")[:-2]
- )
- decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
- module_insertion_transformation_builder(
- decompressor,
- [
- PTTargetPoint(
- TargetType.OPERATOR_POST_HOOK,
- target_node_name=compressed_weight_name,
- )
- ],
- decompressor_name,
- )(model)
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
- model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index cd78f6907c7..31d41bff7be 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,16 +15,11 @@
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
-from executorch.backends.openvino.quantizer.observers.nncf_observers import (
- NNCFInt8observer,
- PTPerBlockParamObserver,
+from executorch.backends.openvino.quantizer.observers import (
+ INT4WeightObserver,
+ INT8WeightObserver,
)
-
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.common.quantization.structs import ( # type: ignore[import-untyped]
- QuantizationScheme,
- QuantizerConfig,
-)
from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
get_weight_compression_configuration,
)
@@ -32,7 +27,6 @@
HistogramObserver,
MappingType,
PerChannelMinMaxObserver,
- PerGroup,
UniformQuantizationObserverBase,
)
from torchao.quantization.pt2e.quantizer import (
@@ -45,7 +39,6 @@
)
QUANT_ANNOTATION_KEY = "quantization_annotation"
-from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
class QuantizationMode(Enum):
@@ -55,15 +48,19 @@ class QuantizationMode(Enum):
- INT8_SYM: INT8 symmetric quantization for both activations and weights.
- INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
- INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
+ - INT8WO_SYM: INT8 symmetric quantization for weights only.
+ - INT8WO_ASYM: INT8 asymmetric quantization for weights only.
+ - INT4WO_SYM: INT4 symmetric quantization for weights only.
+ - INT4WO_ASYM: INT4 asymmetric quantization for weights only
"""
INT8_SYM = "int8_sym"
INT8_MIXED = "int8_mixed"
INT8_TRANSFORMER = "int8_transformer"
- INT8_SYM_WC = "int8_sym_wc"
- INT8_ASYM_WC = "int8_asym_wc"
- INT4_SYM_WC = "int4_sym"
- INT4_ASYM_WC = "int4_asym"
+ INT8WO_SYM = "int8wo_sym"
+ INT8WO_ASYM = "int8wo_asym"
+ INT4WO_SYM = "int4wo_sym"
+ INT4WO_ASYM = "int4wo_asym"
class OpenVINOQuantizer(Quantizer):
@@ -72,10 +69,17 @@ class OpenVINOQuantizer(Quantizer):
optimally for the inference via OpenVINO.
"""
+ WEIGHTS_ONLY_COMPRESSION_MODES = (
+ QuantizationMode.INT4WO_SYM,
+ QuantizationMode.INT4WO_ASYM,
+ QuantizationMode.INT8WO_SYM,
+ QuantizationMode.INT8WO_ASYM,
+ )
+
def __init__(
self,
*,
- mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+ mode: QuantizationMode = QuantizationMode.INT8_SYM,
**kwargs,
):
"""
@@ -89,28 +93,21 @@ def __init__(
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
self.mode = mode
- self.wc_modes = [
- QuantizationMode.INT4_ASYM_WC,
- QuantizationMode.INT4_SYM_WC,
- QuantizationMode.INT8_ASYM_WC,
- QuantizationMode.INT8_SYM_WC,
- ]
- if mode == QuantizationMode.INT8_SYM:
- preset = quantization.structs.QuantizationPreset.PERFORMANCE
- model_type = None
- elif mode == QuantizationMode.INT8_MIXED:
- preset = quantization.structs.QuantizationPreset.MIXED
- model_type = None
- else:
- preset = None
- model_type = nncf.parameters.ModelType.TRANSFORMER
- if self.mode not in self.wc_modes:
- self._min_max_algo = (
+ if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+ if mode == QuantizationMode.INT8_SYM:
+ preset = quantization.structs.QuantizationPreset.PERFORMANCE
+ model_type = None
+ elif mode == QuantizationMode.INT8_MIXED:
+ preset = quantization.structs.QuantizationPreset.MIXED
+ model_type = None
+ else:
+ preset = None
+ model_type = nncf.parameters.ModelType.TRANSFORMER
+ self._algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
)
)
- self._algo = self._min_max_algo
else:
weight_compression_configuration = get_weight_compression_configuration(
mode.value.replace(
@@ -118,10 +115,9 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
- self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+ self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=None, **weight_compression_configuration
)
- self._algo = self._weight_compression_algo
def set_ignored_scope(
self,
@@ -158,104 +154,131 @@ def get_nncf_quantization_setup(
self._algo._set_backend_entity(model)
return self._algo.find_quantization_setup(model, nncf_graph)
- def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
- nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+ def _annotate_weight_compression(
+ self,
+ model: torch.fx.GraphModule,
+ graph: torch.fx.Graph,
+ nncf_graph: NNCFGraph,
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+ ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+ """
+ Annotates the model graph with weight-only quantization specs.
- graph = model.graph
- node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
- defaultdict(QuantizationAnnotation)
- )
- # Serperate into annotation for quantize and compress
- if self.mode in self.wc_modes:
- self._algo.set_backend_entity(model)
- nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
- for node in nodes_to_compress:
- quantization_insertion_point = (
- quantization.quantizer_setup.WeightQuantizationInsertionPoint(
- target_node_name=node.node_name
- )
- )
- group_size = self._algo._group_size
- num_bits = (
- 4
- if self.mode
- in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
- else 8
- )
- qmode = (
- QuantizationScheme.SYMMETRIC
- if self.mode
- in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
- else QuantizationScheme.ASYMMETRIC
- )
- nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
- qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
- qip=quantization_insertion_point,
- qconfig=nncf_qconfig,
- directly_quantized_operator_node_names=[node],
- )
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
- qp, group_size=group_size, weights_only=True
+ Identifies compressible nodes in the NNCF graph and attaches the corresponding
+ TorchAO quantization specifications to their weight edges for later transformation.
+
+ :param model: The FX GraphModule to annotate.
+ :param graph: The underlying FX graph.
+ :param nncf_graph: The corresponding NNCF graph.
+ :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+ :return: Updated mapping of FX nodes with weight compression annotations.
+ """
+ self._algo.set_backend_entity(model)
+ nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+
+ for node in nodes_to_compress:
+ target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, node.node_name
+ )
+ annotation = node_vs_torch_annotation[target_node]
+ edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+ group_size = getattr(self._algo, "_group_size", -1)
+ qspec = self._get_torch_ao_qspec_from_nncf_config(
+ qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+ )
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+ return node_vs_torch_annotation
+
+ def _annotate_post_training_quantization(
+ self,
+ model: torch.fx.GraphModule,
+ graph: torch.fx.Graph,
+ nncf_graph: NNCFGraph,
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+ ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+ """
+ Annotates the model graph with post-training quantization configurations.
+
+ Converts NNCF quantization points into TorchAO-compatible quantization specs,
+ assigning them to corresponding nodes or edges. Also handles unified scale groups,
+ ensuring shared quantization specs across grouped quantizers with consistent configs.
+
+ :param model: The FX GraphModule to annotate.
+ :param graph: The underlying FX graph.
+ :param nncf_graph: The corresponding NNCF graph.
+ :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+ :return: Updated mapping of FX nodes with post-training quantization annotations.
+ """
+ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
+
+ for qp in quantization_setup.quantization_points.values():
+ edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+ graph, nncf_graph, qp, node_vs_torch_annotation
+ )
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+ for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+ nncf_graph, quantizer_ids, quantization_setup
+ )
+ root_qp = quantization_setup.quantization_points[root_quantizer_id]
+
+ if any(
+ root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+ for q_id in quantizer_ids
+ ):
+ qps = [
+ quantization_setup.quantization_points[qid] for qid in quantizer_ids
+ ]
+ raise nncf.InternalError(
+ "Different quantization configs are set to one unified scale group:"
+ f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
)
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- else:
- quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
- for qp in quantization_setup.quantization_points.values():
+ root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+ graph, root_qp.insertion_point.target_node_name
+ )
+ root_edge_or_node = self._get_edge_or_node(
+ root_target_node, root_qp, nncf_graph
+ )
+
+ for quantizer_id in quantizer_ids:
+ if quantizer_id == root_quantizer_id:
+ continue
+
+ qspec = SharedQuantizationSpec(root_edge_or_node) # type: ignore[assignment]
+ qp = quantization_setup.quantization_points[quantizer_id]
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
- qp
- )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
- for quantizer_ids in quantization_setup.unified_scale_groups.values():
+ return node_vs_torch_annotation
- root_quantizer_id = self._get_unified_scales_root_quantizer_id(
- nncf_graph, quantizer_ids, quantization_setup
- )
- root_qp = quantization_setup.quantization_points[root_quantizer_id]
-
- if any(
- root_qp.qconfig
- != quantization_setup.quantization_points[q_id].qconfig
- for q_id in quantizer_ids
- ):
- qps = [
- quantization_setup.quantization_points[q_id]
- for q_id in quantizer_ids
- ]
- msg = (
- "Different quantization configs are set to one unified scale group:"
- f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
- )
- raise nncf.InternalError(msg)
-
- root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, root_qp.insertion_point.target_node_name
- )
- root_edge_or_node = self._get_edge_or_node(
- root_target_node, root_qp, nncf_graph
- )
-
- for quantizer_id in quantizer_ids:
- if quantizer_id == root_quantizer_id:
- continue
+ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+ nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+ graph = model.graph
+ node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
+ defaultdict(QuantizationAnnotation)
+ )
- qspec = SharedQuantizationSpec(root_edge_or_node)
- qp = quantization_setup.quantization_points[quantizer_id]
- edge_or_node, annotation = self._get_edge_or_node_and_annotation(
- graph, nncf_graph, qp, node_vs_torch_annotation
- )
- self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+ if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+ node_vs_torch_annotation = self._annotate_weight_compression(
+ model, graph, nncf_graph, node_vs_torch_annotation
+ )
+ else:
+ node_vs_torch_annotation = self._annotate_post_training_quantization(
+ model, graph, nncf_graph, node_vs_torch_annotation
+ )
for node, annotation in node_vs_torch_annotation.items():
- assert Q_ANNOTATION_KEY not in node.meta
- node.meta[Q_ANNOTATION_KEY] = annotation
+ assert QUANT_ANNOTATION_KEY not in node.meta
+ node.meta[QUANT_ANNOTATION_KEY] = annotation
+
return model
@staticmethod
@@ -317,6 +340,36 @@ def _get_edge_or_node_and_annotation(
edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph)
return edge_or_node, annotation
+ @staticmethod
+ def _get_weight_edge(
+ target_node: torch.fx.Node,
+ nncf_graph: NNCFGraph,
+ ):
+ """
+ Returns the FX node corresponding to the weight tensor input of a given operator node.
+ Uses the NNCF graph to identify which input port of the target node holds the weight.
+ If multiple weight ports are present, a warning is issued and only the first one is used.
+
+ :param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
+ :param nncf_graph: NNCFGraph used to determine weight port indices.
+
+ :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
+ """
+ nncf_node = nncf_graph.get_node_by_name(target_node.name)
+ weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
+ nncf_node, nncf_graph
+ )
+ if len(weights_ports_ids) > 1:
+ # TODO(dlyakhov): support quantization for nodes with several weights
+ nncf.common.logging.nncf_logger.warning(
+ f"Quantization of the weighted node {target_node.name}"
+ " is not yet supported by the OpenVINOQuantizer."
+ f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
+ f" Quantizable weights are located on ports: {weights_ports_ids}."
+ )
+ weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
+ return (weight_node, target_node)
+
@staticmethod
def _get_edge_or_node(
target_node: torch.fx.Node,
@@ -333,22 +386,7 @@ def _get_edge_or_node(
"""
ip = qp.insertion_point
if qp.is_weight_quantization_point():
- nncf_node = nncf_graph.get_node_by_name(target_node.name)
- weights_ports_ids = (
- nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
- nncf_node, nncf_graph
- )
- )
- if len(weights_ports_ids) > 1:
- # TODO(dlyakhov): support quantization for nodes with several weights
- nncf.common.logging.nncf_logger.warning(
- f"Quantization of the weighted node {target_node.name}"
- " is not yet supported by the OpenVINOQuantizer."
- f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
- f" Quantizable weights are located on ports: {weights_ports_ids}."
- )
- weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
- return (weight_node, target_node)
+            return OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
if ip.input_port_id is None:
return target_node
@@ -377,22 +415,67 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
qp: quantization.quantizer_setup.QuantizationPointBase,
- group_size=-1,
- weights_only=False,
+ group_size: int = -1,
+ qmode: Optional[QuantizationMode] = None,
+ weights_only: bool = False,
) -> QuantizationSpec:
"""
- Retrieves the quantization configuration from the given quantization point and
- converts it into a QuantizationSpec.
-
- :param qp: An instance of QuantizationPointBase.
- :return: A QuantizationSpec retrieved and converted from the quantization point.
+ Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
+ and `weights_only`. For post-training quantization, only `qp` is required.
+
+ :param qp: Quantization point from NNCF.
+ :param group_size: Group size for INT4 group-wise quantization.
+ :param qmode: Quantization mode for weight compression.
+ :param weights_only: If True, applies weight-only quantization logic.
+ :return: A TorchAO QuantizationSpec.
"""
+ observer: Type[UniformQuantizationObserverBase]
+
# Eps value is copied from nncf/torch/quantization/layers.py
- extra_args = {"eps": 1e-16}
+ extra_args: Dict[str, Any] = {"eps": 1e-16}
+
+ if weights_only:
+ mapping_type = (
+ MappingType.SYMMETRIC
+ if qmode == QuantizationMode.INT4WO_SYM
+ else MappingType.ASYMMETRIC
+ )
+ if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+ extra_args["mapping_type"] = mapping_type
+ extra_args["target_dtype"] = torch.int8
+ extra_args["group_size"] = group_size
+ observer = INT4WeightObserver
+ quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
+ quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = None
+ else:
+ observer = INT8WeightObserver
+ quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
+ quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qmode == QuantizationMode.INT8WO_SYM
+ else torch.per_channel_affine
+ )
+
+ return QuantizationSpec(
+ dtype=dtype,
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ quant_min=quant_min,
+ quant_max=quant_max,
+ qscheme=torch_qscheme,
+ ch_axis=channel_axis,
+ is_dynamic=False,
+ )
+
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
- observer: Type[UniformQuantizationObserverBase]
if qconfig.per_channel:
torch_qscheme = (
torch.per_channel_symmetric
@@ -406,33 +489,16 @@ def _get_torch_ao_qspec_from_nncf_config(
else torch.per_tensor_affine
)
if is_weight:
- mapping_type = (
- MappingType.SYMMETRIC
- if qconfig.mode == QuantizationScheme.SYMMETRIC
- else MappingType.ASYMMETRIC
+ observer = PerChannelMinMaxObserver
+ quant_min = -128
+ quant_max = 127
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+ else torch.per_channel_affine
)
- if qconfig.num_bits == 4:
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
- extra_args["granularity"] = PerGroup(group_size=group_size)
- observer = PTPerBlockParamObserver
- quant_min = -8
- quant_max = 7
- dtype = torch.int8
- channel_axis = 0
- elif qconfig.num_bits == 8:
- observer = (
- NNCFInt8observer if weights_only else PerChannelMinMaxObserver
- )
- quant_min = -128
- quant_max = 127
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = (
- torch.per_channel_symmetric
- if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
- else torch.per_channel_affine
- )
else:
observer = (
HistogramObserver
@@ -514,4 +580,4 @@ def quantize_model(
smooth_quant=smooth_quant,
**kwargs,
)
- return quantized_model
+ return quantized_model
\ No newline at end of file
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 47527a326f9..54acf67a21d 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -50,6 +50,7 @@
get_pt2e_quantization_params,
get_pt2e_quantizers,
get_qnn_quantizer,
+ get_ov_quantizer,
get_vulkan_quantizer,
)
from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -205,6 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
choices=[
"xnnpack_dynamic",
"xnnpack_dynamic_qc4",
+ "openvino_8da4w",
+ "openvino_8da8w",
"qnn_8a8w",
"qnn_16a16w",
"qnn_16a4w",
@@ -786,6 +789,12 @@ def get_quantizer_and_quant_params(llm_config):
llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
)
quantizers.append(qnn_quantizer)
+ if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
+ assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ ov_quantizer = get_ov_quantizer(
+ llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+ )
+ quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
coreml_quantizer = get_coreml_quantizer(
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index d87c722363f..4669d09e0e7 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
)
- assert (
+ assert (get_qnn_quantizer
quantization_mode is None
), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -215,6 +215,42 @@ def get_qnn_quantizer(
return qnn_quantizer, quant_dtype
+def get_ov_quantizer(
+ pt2e_quantize: str,
+ group_size: int = 32,
+):
+ try:
+ from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
+
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+
+ backend, quant_config = pt2e_quantize.split("_")
+ assert (
+ backend == "openvino"
+ ), f"The quantization config is for backend {backend} instead of openvino."
+ ov_quantizer = OpenVINOQuantizer()
+ # Manually ignore MP layers.
+ # ov_quantizer.set_ignored_scope()
+
+ extra_quantizer_options = {"group_size": group_size}
+ if quant_config == "8da4w":
+ mode = QuantizationMode.INT4WO_SYM
+
+ elif quant_config == "8da8w":
+ mode = QuantizationMode.INT8WO_SYM
+ else:
+ raise AssertionError(
+ f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
+ )
+
+ ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+
+ return ov_quantizer
+
+
def get_coreml_quantizer(pt2e_quantize: str):
try:
from coremltools.optimize.torch.quantization.quantization_config import (
From 4cc7694433b12f7c8afe4c61b785e5158e0798e0 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 26 Aug 2025 18:32:27 +0400
Subject: [PATCH 016/266] fixes
---
backends/openvino/quantizer/quantizer.py | 10 ++++--
examples/models/llama/export_llama_lib.py | 9 +++--
extension/llm/export/config/llm_config.py | 2 ++
extension/llm/export/quantizer_lib.py | 42 +++++++++++++++++++----
4 files changed, 51 insertions(+), 12 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 31d41bff7be..f594c6fffa8 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,6 +12,7 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
+from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
@@ -176,8 +177,12 @@ def _annotate_weight_compression(
"""
self._algo.set_backend_entity(model)
nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+ ignored_names = self._algo.get_ignored_node_names(nncf_graph)
for node in nodes_to_compress:
+ is_target_node = should_consider_scope(node.node_name, ignored_names)
+ if not is_target_node:
+ continue
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node.node_name
)
@@ -442,9 +447,9 @@ def _get_torch_ao_qspec_from_nncf_config(
else MappingType.ASYMMETRIC
)
if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+ extra_args["group_size"] = group_size
extra_args["mapping_type"] = mapping_type
extra_args["target_dtype"] = torch.int8
- extra_args["group_size"] = group_size
observer = INT4WeightObserver
quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
@@ -454,7 +459,7 @@ def _get_torch_ao_qspec_from_nncf_config(
else:
observer = INT8WeightObserver
quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+ quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
@@ -462,7 +467,6 @@ def _get_torch_ao_qspec_from_nncf_config(
if qmode == QuantizationMode.INT8WO_SYM
else torch.per_channel_affine
)
-
return QuantizationSpec(
dtype=dtype,
observer_or_fake_quant_ctr=observer.with_args(**extra_args),
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 54acf67a21d..269f927e9f6 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -791,8 +791,10 @@ def get_quantizer_and_quant_params(llm_config):
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ group_size = llm_config.quantization.group_size
+ group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
- llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+ llm_config.quantization.pt2e_quantize.value,
)
quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -904,6 +906,7 @@ def _to_edge_and_lower_llama_xnnpack(
def _to_edge_and_lower_llama_openvino(
builder_exported,
modelname,
+ quantizers,
additional_passes,
openvino_device: str = "CPU",
nncf_compression: bool = False,
@@ -935,7 +938,6 @@ def _to_edge_and_lower_llama_openvino(
def transform_fn(prompts: str, tokenizer):
tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
- logging.error(tokenized_text)
inputs = ()
inputs = (
@@ -971,7 +973,7 @@ def transform_fn(prompts: str, tokenizer):
sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
)
- builder = builder_exported.to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1214,6 +1216,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
builder = _to_edge_and_lower_llama_openvino(
builder_exported,
modelname,
+ quantizers,
additional_passes,
openvino_device=llm_config.backend.openvino.device,
nncf_compression=llm_config.backend.openvino.nncf_compression,
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index ab18c19159b..b4175d54cd7 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,6 +275,8 @@ class Pt2eQuantize(str, Enum):
xnnpack_dynamic = "xnnpack_dynamic"
xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+ openvino_8da4w = "openvino_8da4w"
+ openvino_8da8w = "openvino_8da8w"
qnn_8a8w = "qnn_8a8w"
qnn_16a16w = "qnn_16a16w"
qnn_16a4w = "qnn_16a4w"
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 4669d09e0e7..2a20a90d55a 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
)
- assert (get_qnn_quantizer
+ assert (
quantization_mode is None
), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -231,22 +231,52 @@ def get_ov_quantizer(
assert (
backend == "openvino"
), f"The quantization config is for backend {backend} instead of openvino."
- ov_quantizer = OpenVINOQuantizer()
+ assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+
# Manually ignore MP layers.
- # ov_quantizer.set_ignored_scope()
+ fp_node_names = linear_list = [
+ "embedding", # First embedding is kept in Full precision
+ "linear_14",
+ "linear_15",
+ "linear_35",
+ "linear_56",
+ "linear_57",
+ "linear_63",
+ "linear_70",
+ "linear_71",
+ "linear_77",
+ "linear_78",
+ "linear_81",
+ "linear_84",
+ "linear_85",
+ "linear_88",
+ "linear_89",
+ "linear_91",
+ "linear_92",
+ "linear_95",
+ "linear_96",
+ "linear_98",
+ "linear_99",
+ "linear_102",
+ "linear_103",
+ "linear_105",
+ "linear_106",
+ "linear_109",
+ "linear_110",
+ "linear_112",]
- extra_quantizer_options = {"group_size": group_size}
if quant_config == "8da4w":
mode = QuantizationMode.INT4WO_SYM
elif quant_config == "8da8w":
+ group_size = -1
mode = QuantizationMode.INT8WO_SYM
else:
raise AssertionError(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
-
- ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+ ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+ ov_quantizer.set_ignored_scope(names=fp_node_names)
return ov_quantizer
From 5da40a57d7d42363b795d483630b00d9ce4b5f31 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Wed, 27 Aug 2025 13:48:41 +0400
Subject: [PATCH 017/266] support all_layers, backup mode in OVQuantizer
---
backends/openvino/quantizer/quantizer.py | 25 ++++---
examples/models/llama/export_llama_lib.py | 82 ++++++++++-------------
extension/llm/export/quantizer_lib.py | 8 +--
3 files changed, 55 insertions(+), 60 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index f594c6fffa8..2ede04e53db 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -116,8 +116,14 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
+ subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+ dataset = None # Only Data Free Quantization is Supported in OVQuantizer
+ compression_format = nncf.CompressionFormat.DQ
+ nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
+ subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
+ )
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
- subset_size=None, **weight_compression_configuration
+ subset_size=subset_size, **weight_compression_configuration
)
def set_ignored_scope(
@@ -176,21 +182,20 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
- ignored_names = self._algo.get_ignored_node_names(nncf_graph)
+ all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
- for node in nodes_to_compress:
- is_target_node = should_consider_scope(node.node_name, ignored_names)
- if not is_target_node:
- continue
+ for wc_param in all_wc_params:
+ wc_config = wc_param.compression_config
+ node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
- graph, node.node_name
+ graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
- group_size = getattr(self._algo, "_group_size", -1)
+ group_size = wc_config.group_size
+ qmode = wc_config.mode
qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+ qp=None, group_size=group_size, qmode=qmode, weights_only=True
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269f927e9f6..00785491100 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -792,9 +792,9 @@ def get_quantizer_and_quant_params(llm_config):
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
- group_size = group_size if group_size else 32
+ group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
- llm_config.quantization.pt2e_quantize.value,
+ llm_config.quantization.pt2e_quantize.value, group_size
)
quantizers.append(ov_quantizer)
if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -921,59 +921,51 @@ def _to_edge_and_lower_llama_openvino(
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
-
+ try:
+ import nncf
+ from functools import partial
+ from pytorch_tokenizers import get_tokenizer
+ except ImportError:
+ raise ImportError(
+ "Please install nncf via backends/openvino/requirements.txt"
+ )
+
+ tokenizer = get_tokenizer(builder_exported.tokenizer_path)
+ from datasets import load_dataset
# Use NNCF compression if enabled
# TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
if nncf_compression:
- try:
- from functools import partial
-
- import nncf
- from pytorch_tokenizers import get_tokenizer
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
- tokenizer = get_tokenizer(builder_exported.tokenizer_path)
-
- def transform_fn(prompts: str, tokenizer):
- tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
-
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+ dataset = dataset.filter(lambda example: example['text'].strip() != "")
+ dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
+ def transform_fn(
+ prompts: str, tokenizer
+ ):
+ tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
+ device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
inputs = ()
inputs = (
- torch.tensor(tokenized_text).unsqueeze(0),
- {"input_pos": torch.tensor([0])},
+ torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
+ {"input_pos": torch.tensor([0], device=device)},
)
return inputs
-
- builder_exported.calibration_data = (
- [builder_exported.calibration_data]
- if isinstance(builder_exported.calibration_data, str)
- else builder_exported.calibration_data
- )
- builder_exported.calibration_data = (
- [
- word
- for prompt in builder_exported.calibration_data
- for word in prompt.split()
- ]
- if not builder_exported.dynamic_shapes
- else builder_exported.calibration_data
- )
-
+
builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(
- builder_exported.calibration_data,
- transform_func=partial(transform_fn, tokenizer=tokenizer),
- ),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
+ builder_exported.pre_autograd_graph_module,
+ dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)),
+ mode=nncf.CompressWeightsMode.INT4_SYM,
+ group_size=32,
+ backup_mode=nncf.BackupMode.NONE,
+ ratio=0.8,
+ sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+ )
+
+ builder = builder_exported.to_edge_transform_and_lower(partitioners)
+
+ else:
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 2a20a90d55a..9220c1efbdc 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -221,7 +221,7 @@ def get_ov_quantizer(
):
try:
from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
-
+ import nncf
except ImportError:
raise ImportError(
"Please install nncf via backends/openvino/requirements.txt"
@@ -234,8 +234,7 @@ def get_ov_quantizer(
assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
# Manually ignore MP layers.
- fp_node_names = linear_list = [
- "embedding", # First embedding is kept in Full precision
+ fp_node_names = [
"linear_14",
"linear_15",
"linear_35",
@@ -262,8 +261,7 @@ def get_ov_quantizer(
"linear_105",
"linear_106",
"linear_109",
- "linear_110",
- "linear_112",]
+ "linear_110",]
if quant_config == "8da4w":
mode = QuantizationMode.INT4WO_SYM
From 9e65a7ef860e5725522859bbf8d863c76e26503d Mon Sep 17 00:00:00 2001
From: anzr299
Date: Wed, 27 Aug 2025 17:29:05 +0400
Subject: [PATCH 018/266] clean up and use new nncf method for obtaining
compression parameters
---
backends/openvino/quantizer/observers.py | 127 ++++++-----------------
backends/openvino/quantizer/quantizer.py | 52 ++++------
2 files changed, 48 insertions(+), 131 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 2ea66f11a55..845a091d24b 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -25,10 +25,7 @@
)
from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
- WeightCompressionConfig,
-)
-from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped]
- FXWeightCompressionAlgoBackend,
+ WeightCompressionParameters,
)
from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
@@ -45,19 +42,31 @@
INT8AsymmetricWeightsDecompressor,
INT8SymmetricWeightsDecompressor,
)
-from torchao.quantization.pt2e import MappingType, ObserverBase
-from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+from torchao.quantization.pt2e import ObserverBase
+
class WeightObserverBase(ObserverBase, ABC):
"""
Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
"""
+ def __init__(
+ self,
+ wc_param: WeightCompressionParameters,
+ dtype: torch.dtype,
+ **kwargs,
+ ) -> None:
+ """
+ :param wc_param: Weight compression parameter which contains information such as group_size
+ reduction_axes, quantization mode etc.
+ :param dtype: target dtype for quantization such as int8, uint8, etc.
+ """
+ super().__init__(dtype=dtype, is_dynamic=False)
+ self.wc_param = wc_param
+
def calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
- observer_node: torch.fx.Node,
- model: torch.fx.GraphModule,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
Calculate quantization parameters such as scale, quantized weight and zero point.
@@ -65,26 +74,11 @@ def calculate_qparams( # type: ignore[override]
:param weight: FP weight to be used for calculating qparams.
:return: quantization params quantized weight, scale and zero point
"""
- ndims = len(weight.size())
- node_with_weight, weight_port_id = (
- WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
- )
- _, node_metatype = GraphConverter.get_node_type_and_metatype(
- node_with_weight, model
- )
- # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
- node_metatype = (
- om.PTAtenEmbeddingMetatype
- if node_metatype == om.PTEmbeddingMetatype
- else node_metatype
- )
- reduction_dims = get_weight_compression_reduction_axes(
- node_metatype, weight_port_id, ndims
- )
- reduction_dims = tuple(reduction_dims)
-
+ wc_param = self.get_wc_param()
+ wc_config = wc_param.compression_config
+ reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+ Tensor(weight), wc_config, reduction_axes=reduction_axes
)
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
@@ -92,23 +86,6 @@ def calculate_qparams( # type: ignore[override]
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x
- @staticmethod
- def get_node_with_weight_and_port_ids(
- observer_node: torch.fx.Node, model: torch.fx.GraphModule
- ) -> Tuple[torch.fx.Node, int]:
- """
- Returns the node which contains the weight and the weight port id.
-
- :param observer_node: Observer node for the weight.
- :param graph: The model.
- :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
- """
- for node in model.graph.nodes:
- if observer_node in node.all_input_nodes:
- return node, node.all_input_nodes.index(observer_node)
- msg = f"Observer node {observer_node.name} has no consumer node"
- raise RuntimeError(msg)
-
def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
@@ -126,7 +103,7 @@ def convert(
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
q_weight, scale, zero_point = self.calculate_qparams(
- original_weight, observer_node, model
+ original_weight
)
decompressor = self._create_decompressor(
@@ -134,6 +111,7 @@ def convert(
)
packed_q_weight = decompressor.pack_weight(q_weight)
+ # Weight port id is 0 since observer is inserted for a single weight only.
constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
@@ -177,7 +155,7 @@ def _create_decompressor(
pass
@abstractmethod
- def get_wc_config(self) -> WeightCompressionConfig:
+ def get_wc_param(self) -> WeightCompressionParameters:
"""
Used to return the respective NNCF Weight Compression Config.
@@ -191,30 +169,6 @@ class INT4WeightObserver(WeightObserverBase):
This class defines the behavior for INT4 Weight Compression which has per-group granularity.
"""
- def __init__(
- self,
- group_size: int,
- mapping_type: MappingType,
- target_dtype: torch.dtype,
- *args,
- **kwargs,
- ) -> None:
- """
- :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
- :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
- :param target_dtype: target dtype for quantization such as int8, uint8, etc.
- """
- super().__init__(dtype=target_dtype, is_dynamic=False)
- self.wc_config = None
- self.mapping_type = mapping_type
-
- qmode = (
- CompressWeightsMode.INT4_ASYM
- if self.mapping_type == MappingType.ASYMMETRIC
- else CompressWeightsMode.INT4_SYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
-
def _create_decompressor(
self,
scale: torch.Tensor,
@@ -235,8 +189,8 @@ def _create_decompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
- def get_wc_config(self):
- return self.wc_config
+ def get_wc_param(self) -> WeightCompressionParameters:
+ return self.wc_param
class INT8WeightObserver(WeightObserverBase):
@@ -244,30 +198,6 @@ class INT8WeightObserver(WeightObserverBase):
This class defines the behavior for Int8 WC which has per channel granularity.
"""
- def __init__(
- self,
- qscheme: torch.qscheme,
- dtype: torch.dtype,
- ch_axis: int = 0,
- *args,
- **kwargs,
- ) -> None:
- """
- :param qscheme: Quantization scheme which is per-channel for Int8 WC.
- :param dtype: dtype for quantization such as int8, uint8, etc..
- :param ch_axis: Channel axis.
- """
- super().__init__(dtype=dtype, is_dynamic=False)
- self.wc_config = None
- self.qscheme = qscheme
-
- qmode = (
- CompressWeightsMode.INT8_SYM
- if self.qscheme == torch.per_channel_symmetric
- else CompressWeightsMode.INT8_ASYM
- )
- self.wc_config = WeightCompressionConfig(mode=qmode)
-
def _create_decompressor(
self,
scale: torch.Tensor,
@@ -282,5 +212,6 @@ def _create_decompressor(
else:
return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
- def get_wc_config(self):
- return self.wc_config
\ No newline at end of file
+ def get_wc_param(self) -> WeightCompressionParameters:
+ return self.wc_param
+
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2ede04e53db..ef9a83ca77c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -24,9 +24,11 @@
from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
get_weight_compression_configuration,
)
+from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
+ WeightCompressionParameters,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
- MappingType,
PerChannelMinMaxObserver,
UniformQuantizationObserverBase,
)
@@ -112,16 +114,11 @@ def __init__(
else:
weight_compression_configuration = get_weight_compression_configuration(
mode.value.replace(
- "_wc", ""
+ "wo", ""
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
- dataset = None # Only Data Free Quantization is Supported in OVQuantizer
- compression_format = nncf.CompressionFormat.DQ
- nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
- subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
- )
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
)
@@ -185,17 +182,14 @@ def _annotate_weight_compression(
all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
for wc_param in all_wc_params:
- wc_config = wc_param.compression_config
node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
- group_size = wc_config.group_size
- qmode = wc_config.mode
qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, group_size=group_size, qmode=qmode, weights_only=True
+ qp=None, wc_param=wc_param
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -425,19 +419,16 @@ def _fill_torch_ao_annotation(
@staticmethod
def _get_torch_ao_qspec_from_nncf_config(
qp: quantization.quantizer_setup.QuantizationPointBase,
- group_size: int = -1,
- qmode: Optional[QuantizationMode] = None,
- weights_only: bool = False,
+ wc_param: WeightCompressionParameters = None,
) -> QuantizationSpec:
"""
Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
- For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
- and `weights_only`. For post-training quantization, only `qp` is required.
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries
+ weight only quantization info such as group_size, reduction_axes etc. For post-training
+ quantization, only `qp` is required.
:param qp: Quantization point from NNCF.
- :param group_size: Group size for INT4 group-wise quantization.
- :param qmode: Quantization mode for weight compression.
- :param weights_only: If True, applies weight-only quantization logic.
+ :param wc_param: NNCF Weight compression parameters for the node.
:return: A TorchAO QuantizationSpec.
"""
observer: Type[UniformQuantizationObserverBase]
@@ -445,26 +436,21 @@ def _get_torch_ao_qspec_from_nncf_config(
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args: Dict[str, Any] = {"eps": 1e-16}
- if weights_only:
- mapping_type = (
- MappingType.SYMMETRIC
- if qmode == QuantizationMode.INT4WO_SYM
- else MappingType.ASYMMETRIC
- )
- if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
- extra_args["group_size"] = group_size
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
+ if wc_param:
+ qmode = wc_param.compression_config.mode
+ if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+ extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
- quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+ quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
dtype = torch.int8
channel_axis = 0
torch_qscheme = None
else:
+ extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
- quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
- quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
+ quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
From 53e0f4cd0e01ed5a8adb85a7c08a2722d4a5a622 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 10:39:20 +0400
Subject: [PATCH 019/266] review changes & update method names according to wc
algo
---
backends/openvino/quantizer/observers.py | 4 ++--
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 845a091d24b..50fcc673ed6 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -30,7 +30,7 @@
from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
do_integer_quantization,
)
-from nncf.tensor.tensor import Tensor # type: ignore[import-untyped]
+from nncf.tensor.tensor import Tensor as NNCFTensor # type: ignore[import-untyped]
from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
PTTargetPoint,
TargetType,
@@ -78,7 +78,7 @@ def calculate_qparams( # type: ignore[override]
wc_config = wc_param.compression_config
reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
- Tensor(weight), wc_config, reduction_axes=reduction_axes
+ NNCFTensor(weight), wc_config, reduction_axes=reduction_axes
)
zp = zp.data if zp is not None else None
return q_weight.data, scale.data, zp
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef9a83ca77c..2e364424b16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -179,7 +179,7 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
+ all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
for wc_param in all_wc_params:
node_with_weight = wc_param.node_with_weight
From bf959305dc210416f20c327509291db3655028e9 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 11:14:13 +0400
Subject: [PATCH 020/266] review changes
---
backends/openvino/quantizer/observers.py | 2 +-
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 50fcc673ed6..b1054460a16 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -166,7 +166,7 @@ def get_wc_param(self) -> WeightCompressionParameters:
class INT4WeightObserver(WeightObserverBase):
"""
- This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+ OpenVINO INT4 Weight Compression observer.
"""
def _create_decompressor(
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2e364424b16..485d67e3bb9 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -187,7 +187,7 @@ def _annotate_weight_compression(
graph, node_with_weight.node_name
)
annotation = node_vs_torch_annotation[target_node]
- edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+ edge_or_node = self._get_weight_edge(target_node, nncf_graph)
qspec = self._get_torch_ao_qspec_from_nncf_config(
qp=None, wc_param=wc_param
)
From 2d4bec7a4b0041ead027a6c651e00eee32343dc4 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 1 Sep 2025 11:31:40 +0400
Subject: [PATCH 021/266] review changes
---
backends/openvino/quantizer/observers.py | 38 ++++++-----------------
backends/openvino/quantizer/quantizer.py | 7 +----
examples/models/llama/export_llama_lib.py | 2 +-
3 files changed, 12 insertions(+), 35 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index b1054460a16..d44a22556dd 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -9,12 +9,7 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
-import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped]
-
import torch
-from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped]
- GraphConverter,
-)
from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
get_tensor_constant_from_node,
@@ -23,7 +18,6 @@
constant_update_fn,
module_insertion_transformation_builder,
)
-from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped]
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
)
@@ -57,9 +51,8 @@ def __init__(
**kwargs,
) -> None:
"""
- :param wc_param: Weight compression parameter which contains information such as group_size
- reduction_axes, quantization mode etc.
- :param dtype: target dtype for quantization such as int8, uint8, etc.
+ :param wc_param: Weight compression parameters container.
+ :param dtype: target dtype for the quantization.
"""
super().__init__(dtype=dtype, is_dynamic=False)
self.wc_param = wc_param
@@ -69,10 +62,10 @@ def calculate_qparams( # type: ignore[override]
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
- Calculate quantization parameters such as scale, quantized weight and zero point.
+ Calculates quantization parameters: quantized weight, quantization scale and quantization zero point.
:param weight: FP weight to be used for calculating qparams.
- :return: quantization params quantized weight, scale and zero point
+ :return: A tuple containing the quantized weight, quantization scale and quantization zero point.
"""
wc_param = self.get_wc_param()
wc_config = wc_param.compression_config
@@ -90,10 +83,8 @@ def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
"""
- Converts the weight observer node into a decompression subgraph after calibration.
- This method is responsible for transforming the model after the quantization preparation
- and calibration phases. It replaces the observer node with the quantized weight and a decompression
- module.
+ Replaces the given observer node from the given model with a quantized
+ weight and a OpenVINO specific decompression module.
:param model: A `torch.fx.GraphModule` representing the statically traced model
with observer nodes attached and calibrated.
@@ -144,7 +135,7 @@ def _create_decompressor(
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
"""
- Used to return the respective NNCF decompressor for different types of quantization.
+ Returns a respective NNCF decompressor for different types of quantization.
:param scale: Calculated scale quantization parameter.
:param zero_point: Calculated zero_point quantization parameter.
@@ -152,17 +143,14 @@ def _create_decompressor(
:param original_weight: FP weight.
:return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
"""
- pass
- @abstractmethod
def get_wc_param(self) -> WeightCompressionParameters:
"""
- Used to return the respective NNCF Weight Compression Config.
+ Returns a respective NNCF Weight Compression Config.
:return: Weight compression config with the compression information such as qmode, group_size etc.
"""
- pass
-
+ return self.wc_param
class INT4WeightObserver(WeightObserverBase):
"""
@@ -189,13 +177,10 @@ def _create_decompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
- def get_wc_param(self) -> WeightCompressionParameters:
- return self.wc_param
-
class INT8WeightObserver(WeightObserverBase):
"""
- This class defines the behavior for Int8 WC which has per channel granularity.
+ OpenVINO INT8 Weight Compression per channel observer.
"""
def _create_decompressor(
@@ -212,6 +197,3 @@ def _create_decompressor(
else:
return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
- def get_wc_param(self) -> WeightCompressionParameters:
- return self.wc_param
-
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 485d67e3bb9..7f86686d03c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -205,15 +205,10 @@ def _annotate_post_training_quantization(
"""
Annotates the model graph with post-training quantization configurations.
- Converts NNCF quantization points into TorchAO-compatible quantization specs,
- assigning them to corresponding nodes or edges. Also handles unified scale groups,
- ensuring shared quantization specs across grouped quantizers with consistent configs.
-
:param model: The FX GraphModule to annotate.
:param graph: The underlying FX graph.
:param nncf_graph: The corresponding NNCF graph.
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
:return: Updated mapping of FX nodes with post-training quantization annotations.
"""
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -575,4 +570,4 @@ def quantize_model(
smooth_quant=smooth_quant,
**kwargs,
)
- return quantized_model
\ No newline at end of file
+ return quantized_model
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 00785491100..269022f2cf7 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
)
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
- assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+ assert quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
From 0a2e361f04aa724c8af7d88c1dbd286b4c7556d6 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 3 Sep 2025 20:48:10 +0400
Subject: [PATCH 022/266] Update export_llama_lib.py
---
examples/models/llama/export_llama_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269022f2cf7..8eab3eefbc0 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
)
quantizers.append(qnn_quantizer)
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
- assert quantizers, "Should not enable both xnnpack and openvino"
+ assert not quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
group_size = group_size if group_size else 32
ov_quantizer = get_ov_quantizer(
From 4c86a9c91d6eeec8eca53ea66d4f5132cd007a6d Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 3 Sep 2025 13:32:08 -0700
Subject: [PATCH 023/266] enable group_size parameter for nncf compression
---
backends/openvino/requirements.txt | 2 +-
examples/models/llama/export_llama_lib.py | 3 +++
extension/llm/export/config/llm_config.py | 5 ++++-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 316633e9004..2ada445414c 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
transformers
-git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 47527a326f9..417d25550ab 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -898,6 +898,7 @@ def _to_edge_and_lower_llama_openvino(
additional_passes,
openvino_device: str = "CPU",
nncf_compression: bool = False,
+ nncf_compression_group_size: int = 32,
verbose: bool = False,
) -> LLMEdgeManager: # noqa: C901
partitioners = []
@@ -959,6 +960,7 @@ def transform_fn(prompts: str, tokenizer):
),
mode=nncf.CompressWeightsMode.INT4_SYM,
ratio=0.8,
+ group_size=nncf_compression_group_size,
sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
)
@@ -1208,6 +1210,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
additional_passes,
openvino_device=llm_config.backend.openvino.device,
nncf_compression=llm_config.backend.openvino.nncf_compression,
+ nncf_compression_group_size=llm_config.backend.openvino.nncf_compression_group_size,
verbose=llm_config.debug.verbose,
)
else:
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index ab18c19159b..c8f15bc1f9a 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -456,7 +456,8 @@ class OpenvinoConfig:
enabled: bool = False
device: str = "CPU"
- nncf_compression = False
+ nncf_compression: bool = False
+ nncf_compression_group_size: int = 32
@dataclass
@@ -645,6 +646,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
llm_config.backend.openvino.device = args.openvino_device
if hasattr(args, "nncf_compression"):
llm_config.backend.openvino.nncf_compression = args.nncf_compression
+ if hasattr(args, "group_size") and args.group_size:
+ llm_config.backend.openvino.nncf_compression_group_size = args.group_size
# DebugConfig
if hasattr(args, "profile_memory"):
From 46ed3f6d5ca71439c13c781eea1156bd4383ad3c Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 15:09:13 -0700
Subject: [PATCH 024/266] Update README.md
---
backends/openvino/README.md | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index a67cf12eca2..73b6bd9b20a 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -42,11 +42,23 @@ executorch
Before you begin, ensure you have openvino installed and configured on your system.
-### Build OpenVINO from Source
+### Use OpenVINO from Release Packages
+
+1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
+
+2. Extract the release package from the archive and set the environment variables.
+
+ ```bash
+ tar -zxf openvino_toolkit_.tgz
+ cd openvino_toolkit_
+ source setupvars.sh
+ ```
+
+### (Optional) Build OpenVINO from Source
```bash
git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout b16b776ac119dafda51f69a80f1e6b7376d02c3b
+cd openvino
git submodule update --init --recursive
sudo ./install_build_dependencies.sh
mkdir build && cd build
@@ -59,18 +71,6 @@ cd
source setupvars.sh
```
-### Use OpenVINO from Release Packages
-
-1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
-
-2. Extract the release package from the archive and set the environment variables.
-
- ```bash
- tar -zxf openvino_toolkit_.tgz
- cd openvino_toolkit_
- source setupvars.sh
- ```
-
For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md).
### Setup
From 0a1256eb351a5562e593f82ed921da2eeb9b245f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 15:26:08 -0700
Subject: [PATCH 025/266] Update README.md
---
backends/openvino/README.md | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index 73b6bd9b20a..ce10b902646 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -77,17 +77,27 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct
Follow the steps below to setup your build environment:
-1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment.
-2. **Setup OpenVINO Backend Environment**
+1. **Create a Virtual Environment**
+- Create a virtual environment and activate it by executing the commands below.
+ ```bash
+ python -m venv env
+ source env/bin/activate
+ ```
+2. **Clone ExecuTorch Repository from Github**
+- Clone Executorch repository by executing the command below.
+ ```bash
+ git clone --recurse-submodules https://github.com/pytorch/executorch.git
+ ```
+3. **Setup OpenVINO Backend Environment**
- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
```bash
pip install -r requirements.txt
```
Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher).
-3. Navigate to `scripts/` directory.
+4. Navigate to `scripts/` directory.
-4. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
+5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
```bash
./openvino_build.sh
@@ -97,6 +107,7 @@ Follow the steps below to setup your build environment:
```bash
./openvino_build.sh --enable_python
```
+For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide.
### Run
From f2151e3baddd32003f5d0e5bb36e34830207a76c Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Wed, 3 Sep 2025 17:25:15 -0700
Subject: [PATCH 026/266] Update README.md
---
backends/openvino/README.md | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index ce10b902646..cc5b20cbab8 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -89,24 +89,25 @@ Follow the steps below to setup your build environment:
```bash
git clone --recurse-submodules https://github.com/pytorch/executorch.git
```
-3. **Setup OpenVINO Backend Environment**
-- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
+3. **Build ExecuTorch with OpenVINO Backend**
+- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing.
```bash
- pip install -r requirements.txt
- ```
- Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher).
-4. Navigate to `scripts/` directory.
-
-5. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
-
- ```bash
- ./openvino_build.sh
+ openvino_build.sh
```
+- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
**Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
-
- ```bash
+ ```bash
./openvino_build.sh --enable_python
```
+ **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models.
+ ```bash
+ ./openvino_build.sh --cpp_runtime
+ ```
+ **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`.
+ ```bash
+ ./openvino_build.sh --llama_runner
+ ```
+
For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide.
### Run
From dfc8eab6d862a9be10e95fd6ae82e122c9869574 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 3 Sep 2025 17:55:26 -0700
Subject: [PATCH 027/266] openvino backend build script updates
---
backends/openvino/scripts/openvino_build.sh | 155 ++++++++++++--------
1 file changed, 91 insertions(+), 64 deletions(-)
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index 08741840ddb..b7e5f5270ab 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -7,79 +7,106 @@ set -e
EXECUTORCH_ROOT=$(realpath "$(dirname "$0")/../../..")
echo EXECUTORCH_ROOT=${EXECUTORCH_ROOT}
-main() {
- build_type=${1:-"--cpp_runtime"}
-
- # If the first arguments is --cpp_runtime (default), build libraries for C++ runtime
- if [[ -z "$build_type" || "$build_type" == "--cpp_runtime" ]]; then
- echo "Building C++ Runtime Libraries"
-
- # Set build directory
- local build_dir="cmake-out"
-
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
- rm -rf "${build_dir}"
-
- # Configure the project with CMake
- # Note: Add any additional configuration options you need here
- cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
- -DCMAKE_BUILD_TYPE=Release \
- -DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
- -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
- -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
- -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
- -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
- -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
- -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
- -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
- -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
- -B"${build_dir}"
-
-
- # Build the project
- cmake --build ${build_dir} --target install --config Release -j$(nproc)
+install_requirements() {
+ echo "Installing Requirements For OpenVINO Backend"
+ cd "$EXECUTORCH_ROOT"
+ pip install -r backends/openvino/requirements.txt
+}
- # If the first arguments is --enable_python, build python package with python bindings
- elif [[ "$build_type" == "--enable_python" ]]; then
- echo "Building Python Package with Pybinding"
+build_cpp_runtime() {
+ echo "Building C++ Runtime Libraries"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+ rm -rf "${build_dir}"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+ -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+ -B"${build_dir}"
+
+
+ # Build the project
+ cmake --build ${build_dir} --target install --config Release -j$(nproc)
+}
+
+build_llama_runner() {
+ echo "Building Export Llama Runner"
+
+ # Set build directory
+ local build_dir="cmake-out"
+
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+
+ # Configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -B"${build_dir}"/examples/models/llama \
+ examples/models/llama
+ # Build the export llama runner
+ cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
+}
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
- ./install_executorch.sh --clean
+build_python_enabled() {
+ echo "Building Python Package with Pybinding"
- # Set parameters to configure the project with CMake
- # Note: Add any additional configuration options you need here
- export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
- -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
- export CMAKE_BUILD_ARGS="--target openvino_backend"
+ # Enter the Executorch root directory
+ cd "$EXECUTORCH_ROOT"
+ ./install_executorch.sh --clean
- # Build the package
- ./install_executorch.sh --minimal
+ # Set parameters to configure the project with CMake
+ # Note: Add any additional configuration options you need here
+ export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
+ export CMAKE_BUILD_ARGS="--target openvino_backend"
- # Install torchao
- pip install third-party/ao
+ # Build the package
+ ./install_executorch.sh --minimal
+
+ # Install torchao
+ pip install third-party/ao
+}
+
+main() {
+ build_type=${1:-"--build_all"}
+
+ # If the first arguments is --build_all (default), build python package, C++ runtime, and llama runner binary
+ if [[ -z "$build_type" || "$build_type" == "--build_all" ]]; then
+ install_requirements
+ build_python_enabled
+ build_cpp_runtime
+ build_llama_runner
+
+ # If the first arguments is --cpp_runtime, build libraries for C++ runtime
+ elif [[ "$build_type" == "--cpp_runtime" ]]; then
+ build_cpp_runtime
# If the first arguments is --llama_runner, build export llama runner binary
# Note: c++ runtime with openvino backend should be built before building export llama runner
elif [[ "$build_type" == "--llama_runner" ]]; then
- echo "Building Export Llama Runner"
-
- # Set build directory
- local build_dir="cmake-out"
-
- # Enter the Executorch root directory
- cd "$EXECUTORCH_ROOT"
-
- # Configure the project with CMake
- # Note: Add any additional configuration options you need here
- cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
- -DCMAKE_BUILD_TYPE=Release \
- -B"${build_dir}"/examples/models/llama \
- examples/models/llama
- # Build the export llama runner
- cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
+ build_llama_runner
+
+ # If the first arguments is --enable_python, build python package with python bindings
+ elif [[ "$build_type" == "--enable_python" ]]; then
+ install_requirements
+ build_python_enabled
+
else
echo "Error: Argument is not valid: $build_type"
exit 1 # Exit the script with an error code
From 2ac8a8c0b7ea3f2e0b391b1b7cba9460b71dad86 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 4 Sep 2025 15:41:46 -0700
Subject: [PATCH 028/266] Update README.md
---
backends/openvino/README.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index cc5b20cbab8..71bd27f6b50 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -18,6 +18,11 @@ For more information on the supported hardware, please refer to [OpenVINO System
executorch
├── backends
│ └── openvino
+│ ├── quantizer
+│ ├── observers
+│ └── nncf_observers.py
+│ ├── __init__.py
+│ └── quantizer.py
│ ├── runtime
│ ├── OpenvinoBackend.cpp
│ └── OpenvinoBackend.h
@@ -95,6 +100,7 @@ Follow the steps below to setup your build environment:
openvino_build.sh
```
- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
+
**Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
```bash
./openvino_build.sh --enable_python
From 35444aefa26b92b802305669fcef5a7ee857a654 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 4 Sep 2025 15:59:36 -0700
Subject: [PATCH 029/266] Update README.md
---
backends/openvino/README.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index 71bd27f6b50..0046ad23486 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -95,21 +95,21 @@ Follow the steps below to setup your build environment:
git clone --recurse-submodules https://github.com/pytorch/executorch.git
```
3. **Build ExecuTorch with OpenVINO Backend**
-- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime binaries into `/cmake-out` for quick inference testing.
+- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, and also compiles the C++ runtime libraries and binaries into `/cmake-out` for quick inference testing.
```bash
openvino_build.sh
```
-- Optionally, `openvino_build.sh` script can be used to build python package or C++ bineries seperately.
+- Optionally, `openvino_build.sh` script can be used to build python package or C++ libraries/binaries separately.
- **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
+ **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument as shown in the below command. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
```bash
./openvino_build.sh --enable_python
```
- **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` argument to build C++ runtime libraries into `/cmake-out` folder. `/cmake-out/backends/openvino/openvino_executor_runner` binary file can be used for quick inferencing with vision models.
+ **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models.
```bash
./openvino_build.sh --cpp_runtime
```
- **Build C++ Llama Runner**: This step requires first building the C++ runtime libraries by following the previous instructions. Then, run `openvino_build.sh` script with the `--llama_runner` argument to compile the llama runner to execute inference with models exported using `export_llama`. The compiled binary file is located in `/cmake-out/examples/models/llama/llama_main`.
+ **Build C++ Llama Runner**: First, ensure the C++ runtime libraries are built by following the earlier instructions. Then, run the `openvino_build.sh` script with the `--llama_runner` flag to compile the Llama runner as shown in the below command, which enables executing inference with models exported using `export_llama`. The resulting binary is located at: `/cmake-out/examples/models/llama/llama_main`
```bash
./openvino_build.sh --llama_runner
```
From 5b8b633a94ca13b672db873e07725363c2e2014c Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:18:03 -0700
Subject: [PATCH 030/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index a2920285f99..10d4b2b30a7 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,7 +27,7 @@
class PatternNode:
- op_types = {}
+ op_types: dict[str, list] = {}
def __init__(self):
self.op_types = {}
From f4a1423ddc5517495b0993d7d183450e4605f702 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:33:16 -0700
Subject: [PATCH 031/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 10d4b2b30a7..4893a89bebb 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -114,7 +114,7 @@ def __init__(
self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
self._op_types_to_skip = op_types_to_skip
self._op_names_to_skip = op_names_to_skip
- self._enabled_ops_by_name = set()
+ self._enabled_ops_by_name: set = set()
def ops_to_not_decompose(
self,
From 44f08831df4d4707b1fba855299293ab435815f6 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 4 Sep 2025 17:39:03 -0700
Subject: [PATCH 032/266] formatting fix
---
backends/openvino/partitioner.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 4893a89bebb..1d93ebd9cec 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -141,10 +141,10 @@ def check_pattern(
self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
) -> bool:
if node.op == "call_function":
- if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
+ if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr]
pt_input_nodes = node.all_input_nodes
pattern_input_ops = pattern.op_types[
- "call_function" + ":" + str(node.target.__name__)
+ "call_function" + ":" + str(node.target.__name__) # type: ignore[union-attr]
]
if pattern_input_ops is None:
enabled_ops.append(node)
From 5f657d3ce8cdc34edf2c3129b274c02917a30231 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:02:55 -0700
Subject: [PATCH 033/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 1d93ebd9cec..d4aff6fa7d3 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,7 +27,7 @@
class PatternNode:
- op_types: dict[str, list] = {}
+ op_types: dict[str, Optional[list]] = {}
def __init__(self):
self.op_types = {}
From eafcc33ab6bf99b0bfe8155f324af3e961cba279 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:06:29 -0700
Subject: [PATCH 034/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index d4aff6fa7d3..5ed9508ca89 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -193,7 +193,7 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
str(node.op) == "call_function"
and str(node.target.__name__) == "aten.stack.default"
):
- enabled_ops = []
+ enabled_ops: list = []
pattern_match = self.check_pattern(node, stack_node, enabled_ops)
if pattern_match:
for pattern_op in enabled_ops:
From 1763b99d7c7785a1b2f5c3152601924f97c07fea Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:14:59 -0700
Subject: [PATCH 035/266] formatting fix
---
backends/openvino/partitioner.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 5ed9508ca89..20841d6730b 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -138,7 +138,7 @@ def ops_to_not_decompose(
return (ops_not_decompose, None)
def check_pattern(
- self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list
+ self, node: torch.fx.Node, pattern: type[PatternNode], enabled_ops: list
) -> bool:
if node.op == "call_function":
if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: # type: ignore[union-attr]
From 486382636b43a348512a934110f3215bbc67e842 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:23:35 -0700
Subject: [PATCH 036/266] formatting fix
---
backends/openvino/quantizer/observers/nncf_observers.py | 4 ++--
backends/openvino/quantizer/quantizer.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
index f6ac2a3cb91..ac95b1bbef5 100644
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -111,7 +111,7 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
model.graph.erase_node(observer_node)
@@ -172,5 +172,5 @@ def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
decompressor_name,
)(model)
decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node)
+ observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index cd78f6907c7..84e29239419 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -75,7 +75,7 @@ class OpenVINOQuantizer(Quantizer):
def __init__(
self,
*,
- mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+ mode: QuantizationMode = QuantizationMode.INT8_SYM,
**kwargs,
):
"""
From e24072fc68c7884b62a437de3d8d2b7f60cd9efe Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:43:00 -0700
Subject: [PATCH 037/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 84e29239419..5cbd50c3136 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -208,7 +208,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( # type: ignore[no-redef]
qp
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -412,9 +412,9 @@ def _get_torch_ao_qspec_from_nncf_config(
else MappingType.ASYMMETRIC
)
if qconfig.num_bits == 4:
- extra_args["mapping_type"] = mapping_type
- extra_args["target_dtype"] = torch.int8
- extra_args["granularity"] = PerGroup(group_size=group_size)
+ extra_args["mapping_type"] = mapping_type # type: ignore[assignment]
+ extra_args["target_dtype"] = torch.int8 # type: ignore[assignment]
+ extra_args["granularity"] = PerGroup(group_size=group_size) # type: ignore[assignment]
observer = PTPerBlockParamObserver
quant_min = -8
quant_max = 7
From b9bb5f08224544f9f4e9a6896bf756fc41462ce3 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:51:16 -0700
Subject: [PATCH 038/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 5cbd50c3136..aef9e56876b 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -391,6 +391,10 @@ def _get_torch_ao_qspec_from_nncf_config(
extra_args = {"eps": 1e-16}
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
+ dtype = None
+ quant_min = None
+ quant_max = None
+ channel_axis = None
observer: Type[UniformQuantizationObserverBase]
if qconfig.per_channel:
From 291dcd993e17136a3609e30919aa4d406ed54113 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Fri, 5 Sep 2025 10:56:31 -0700
Subject: [PATCH 039/266] formatting fix
---
backends/openvino/quantizer/quantizer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index aef9e56876b..f2011431a03 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -391,7 +391,7 @@ def _get_torch_ao_qspec_from_nncf_config(
extra_args = {"eps": 1e-16}
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
- dtype = None
+ dtype = torch.int8
quant_min = None
quant_max = None
channel_axis = None
From c8ea777098b8a812e6162b767dbfeabdd7c193c4 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:39:52 +0400
Subject: [PATCH 040/266] use new transformations
---
backends/openvino/quantizer/observers.py | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index d44a22556dd..76ab33eb5c5 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -15,8 +15,9 @@
get_tensor_constant_from_node,
)
from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
- constant_update_fn,
- module_insertion_transformation_builder,
+ constant_update,
+ module_insertion,
+ node_removal,
)
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
@@ -103,7 +104,7 @@ def convert(
packed_q_weight = decompressor.pack_weight(q_weight)
# Weight port id is 0 since observer is inserted for a single weight only.
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+ constant_update(model, observer_node, packed_q_weight, input_port_id=0)
compressed_weight_name = observer_node.all_input_nodes[0].name
decompressor_suffix = "_".join(
@@ -111,7 +112,8 @@ def convert(
)
decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
- module_insertion_transformation_builder(
+ module_insertion(
+ model,
decompressor,
[
PTTargetPoint(
@@ -120,11 +122,8 @@ def convert(
)
],
decompressor_name,
- )(model)
-
- decomp_node = observer_node.args[0]
- observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type]
- model.graph.erase_node(observer_node)
+ )
+ node_removal(model, observer_node, 0)
@abstractmethod
def _create_decompressor(
From a6b605f41b5390ff9de70b2397a2d00003f34ff2 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:46:24 +0400
Subject: [PATCH 041/266] add comment for manual MP allocation
---
extension/llm/export/quantizer_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 9220c1efbdc..e839827208c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -233,7 +233,7 @@ def get_ov_quantizer(
), f"The quantization config is for backend {backend} instead of openvino."
assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # Manually ignore MP layers.
+ # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
"linear_14",
"linear_15",
From 9614fc4da170d76a39e047d0c364177bf96d0209 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:48:58 +0400
Subject: [PATCH 042/266] remove nncf_compression from export llama lib
---
examples/models/llama/export_llama_lib.py | 54 +----------------------
1 file changed, 1 insertion(+), 53 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 8eab3eefbc0..ac52893b99c 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -567,13 +567,6 @@ def build_args_parser() -> argparse.ArgumentParser:
help="path to the input pruning token mapping file (token_map.json)",
)
- parser.add_argument(
- "--nncf_compression",
- default=False,
- action="store_true",
- help="Enables nncf compression for openvino backend",
- )
-
parser.add_argument(
"--export_only",
default=False,
@@ -909,7 +902,6 @@ def _to_edge_and_lower_llama_openvino(
quantizers,
additional_passes,
openvino_device: str = "CPU",
- nncf_compression: bool = False,
verbose: bool = False,
) -> LLMEdgeManager: # noqa: C901
partitioners = []
@@ -921,51 +913,8 @@ def _to_edge_and_lower_llama_openvino(
logging.info("Lowering model using following partitioner(s): ")
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
- try:
- import nncf
- from functools import partial
- from pytorch_tokenizers import get_tokenizer
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
- )
-
- tokenizer = get_tokenizer(builder_exported.tokenizer_path)
- from datasets import load_dataset
- # Use NNCF compression if enabled
- # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
- if nncf_compression:
- dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
- dataset = dataset.filter(lambda example: example['text'].strip() != "")
- dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
- def transform_fn(
- prompts: str, tokenizer
- ):
- tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
- device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
- inputs = ()
- inputs = (
- torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
- {"input_pos": torch.tensor([0], device=device)},
- )
-
- return inputs
-
- builder_exported.pre_autograd_graph_module = nncf.compress_weights(
- builder_exported.pre_autograd_graph_module,
- dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)),
- mode=nncf.CompressWeightsMode.INT4_SYM,
- group_size=32,
- backup_mode=nncf.BackupMode.NONE,
- ratio=0.8,
- sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
- )
-
- builder = builder_exported.to_edge_transform_and_lower(partitioners)
-
- else:
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1211,7 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
quantizers,
additional_passes,
openvino_device=llm_config.backend.openvino.device,
- nncf_compression=llm_config.backend.openvino.nncf_compression,
verbose=llm_config.debug.verbose,
)
else:
From 45007cf90c054ccfd527874ae35d383fc34a4ee8 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:52:58 +0400
Subject: [PATCH 043/266] change pt2e quantize flag to use openvino_4wo instead
of openvino_8da4w and so on
---
extension/llm/export/config/llm_config.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index b4175d54cd7..49855d61e6e 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,8 +275,8 @@ class Pt2eQuantize(str, Enum):
xnnpack_dynamic = "xnnpack_dynamic"
xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
- openvino_8da4w = "openvino_8da4w"
- openvino_8da8w = "openvino_8da8w"
+ openvino_4wo = "openvino_4wo"
+ openvino_8wo = "openvino_8wo"
qnn_8a8w = "qnn_8a8w"
qnn_16a16w = "qnn_16a16w"
qnn_16a4w = "qnn_16a4w"
From 9d494147457e6696f7149e4b7cb69f95811cbd47 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:53:14 +0400
Subject: [PATCH 044/266] follow up to last commit
---
examples/models/llama/export_llama_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ac52893b99c..ec03f4b26c9 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -206,8 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
choices=[
"xnnpack_dynamic",
"xnnpack_dynamic_qc4",
- "openvino_8da4w",
- "openvino_8da8w",
+ "openvino_4wo",
+ "openvino_8wo",
"qnn_8a8w",
"qnn_16a16w",
"qnn_16a4w",
From d6727cfed609d07281fdea42358d2e234ac82f19 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 13:56:47 +0400
Subject: [PATCH 045/266] update quantizer lib with openvino_4wo
---
extension/llm/export/quantizer_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index e839827208c..8a097f9b8f1 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -263,10 +263,10 @@ def get_ov_quantizer(
"linear_109",
"linear_110",]
- if quant_config == "8da4w":
+ if quant_config == "4wo":
mode = QuantizationMode.INT4WO_SYM
- elif quant_config == "8da8w":
+ elif quant_config == "8wo":
group_size = -1
mode = QuantizationMode.INT8WO_SYM
else:
From 4a0a7819ab69aa0d8fdfce70f3be219c14abc409 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Sat, 6 Sep 2025 14:06:48 +0400
Subject: [PATCH 046/266] split qspec function into 2 parts; 1 for WC and other
for PTQ qspecs
---
backends/openvino/quantizer/quantizer.py | 92 +++++++++++++-----------
1 file changed, 50 insertions(+), 42 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7f86686d03c..ef04ed0de46 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -188,8 +188,8 @@ def _annotate_weight_compression(
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = self._get_weight_edge(target_node, nncf_graph)
- qspec = self._get_torch_ao_qspec_from_nncf_config(
- qp=None, wc_param=wc_param
+ qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
+ wc_param=wc_param
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
@@ -217,7 +217,7 @@ def _annotate_post_training_quantization(
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+ qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -412,18 +412,58 @@ def _fill_torch_ao_annotation(
annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
@staticmethod
- def _get_torch_ao_qspec_from_nncf_config(
+ def _get_torch_ao_qspec_from_nncf_config_for_wc(
+ wc_param: WeightCompressionParameters,
+ ) -> QuantizationSpec:
+ """
+ Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter.
+
+ :param wc_param: NNCF Weight compression parameters for the node.
+ :return: A TorchAO QuantizationSpec.
+ """
+ observer: Type[UniformQuantizationObserverBase]
+
+ extra_args: Dict[str, Any] = {}
+
+ qmode = wc_param.compression_config.mode
+ if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+ extra_args["wc_param"] = wc_param
+ observer = INT4WeightObserver
+ quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = None
+ else:
+ extra_args["wc_param"] = wc_param
+ observer = INT8WeightObserver
+ quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+ quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+ dtype = torch.int8
+ channel_axis = 0
+ torch_qscheme = (
+ torch.per_channel_symmetric
+ if qmode == QuantizationMode.INT8WO_SYM
+ else torch.per_channel_affine
+ )
+ return QuantizationSpec(
+ dtype=dtype,
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ quant_min=quant_min,
+ quant_max=quant_max,
+ qscheme=torch_qscheme,
+ ch_axis=channel_axis,
+ is_dynamic=False,
+ )
+
+ @staticmethod
+ def _get_torch_ao_qspec_from_nncf_config_for_ptq(
qp: quantization.quantizer_setup.QuantizationPointBase,
- wc_param: WeightCompressionParameters = None,
) -> QuantizationSpec:
"""
- Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
- For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries
- weight only quantization info such as group_size, reduction_axes etc. For post-training
- quantization, only `qp` is required.
+ Returns a TorchAO QuantizationSpec based on NNCF quantization point.
:param qp: Quantization point from NNCF.
- :param wc_param: NNCF Weight compression parameters for the node.
:return: A TorchAO QuantizationSpec.
"""
observer: Type[UniformQuantizationObserverBase]
@@ -431,38 +471,6 @@ def _get_torch_ao_qspec_from_nncf_config(
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args: Dict[str, Any] = {"eps": 1e-16}
- if wc_param:
- qmode = wc_param.compression_config.mode
- if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- extra_args["wc_param"] = wc_param
- observer = INT4WeightObserver
- quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = None
- else:
- extra_args["wc_param"] = wc_param
- observer = INT8WeightObserver
- quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
- dtype = torch.int8
- channel_axis = 0
- torch_qscheme = (
- torch.per_channel_symmetric
- if qmode == QuantizationMode.INT8WO_SYM
- else torch.per_channel_affine
- )
- return QuantizationSpec(
- dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(**extra_args),
- quant_min=quant_min,
- quant_max=quant_max,
- qscheme=torch_qscheme,
- ch_axis=channel_axis,
- is_dynamic=False,
- )
-
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig
From f6a1ee3d708ca46fe495f081bc45872042b1bed6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 12:14:34 +0400
Subject: [PATCH 047/266] micro fix
---
backends/openvino/quantizer/quantizer.py | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef04ed0de46..762ed2a9171 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -426,24 +426,29 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
extra_args: Dict[str, Any] = {}
qmode = wc_param.compression_config.mode
+ is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
- quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+ quant_min = -8 if not is_asym_mode else 0
+ quant_max = 7 if not is_asym_mode else 15
dtype = torch.int8
channel_axis = 0
- torch_qscheme = None
+            torch_qscheme = (
+ torch.per_channel_symmetric
+ if not is_asym_mode
+ else torch.per_channel_affine
+ )
else:
extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
- quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
- quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+ quant_min = -128 if not is_asym_mode else 0
+ quant_max = 127 if not is_asym_mode else 255
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
torch.per_channel_symmetric
- if qmode == QuantizationMode.INT8WO_SYM
+ if not is_asym_mode
else torch.per_channel_affine
)
return QuantizationSpec(
From d285fcce354f8bde55e968892932cbe4a34421cd Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 15:35:49 +0400
Subject: [PATCH 048/266] Update mixed precision layers for higher accuracy.
Change INT4 mode to Asymmetric
---
extension/llm/export/quantizer_lib.py | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 8a097f9b8f1..46b10dcb960 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,21 +235,17 @@ def get_ov_quantizer(
# (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
+ "linear_13",
"linear_14",
- "linear_15",
"linear_35",
"linear_56",
- "linear_57",
- "linear_63",
"linear_70",
"linear_71",
"linear_77",
"linear_78",
- "linear_81",
"linear_84",
"linear_85",
"linear_88",
- "linear_89",
"linear_91",
"linear_92",
"linear_95",
@@ -261,10 +257,11 @@ def get_ov_quantizer(
"linear_105",
"linear_106",
"linear_109",
- "linear_110",]
+ "linear_110",
+ "linear_111",]
if quant_config == "4wo":
- mode = QuantizationMode.INT4WO_SYM
+ mode = QuantizationMode.INT4WO_ASYM
elif quant_config == "8wo":
group_size = -1
From 4e66df1a52e40e90178f4c9fce815d364c5282f9 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Mon, 8 Sep 2025 18:12:37 +0400
Subject: [PATCH 049/266] Apply suggestions from code review
Co-authored-by: Daniil Lyakhov
---
backends/openvino/quantizer/observers.py | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 76ab33eb5c5..59a40f2be2d 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -56,9 +56,9 @@ def __init__(
:param dtype: target dtype for the quantization.
"""
super().__init__(dtype=dtype, is_dynamic=False)
- self.wc_param = wc_param
+ self._wc_param = wc_param
- def calculate_qparams( # type: ignore[override]
+ def _calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -68,7 +68,7 @@ def calculate_qparams( # type: ignore[override]
:param weight: FP weight to be used for calculating qparams.
:return: A tuple containing the quantized weight, quantization scale and quantization zero point.
"""
- wc_param = self.get_wc_param()
+ wc_param = self._wc_param
wc_config = wc_param.compression_config
reduction_axes = wc_param.reduction_axes
q_weight, scale, zp = do_integer_quantization(
@@ -143,13 +143,6 @@ def _create_decompressor(
:return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
"""
- def get_wc_param(self) -> WeightCompressionParameters:
- """
- Returns a respective NNCF Weight Compression Config.
-
- :return: Weight compression config with the compression information such as qmode, group_size etc.
- """
- return self.wc_param
class INT4WeightObserver(WeightObserverBase):
"""
From e850e419cb313e86fd0f5669e7eaa1d115fcc10c Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:13:28 +0400
Subject: [PATCH 050/266] Review changes
---
backends/openvino/quantizer/observers.py | 30 ++++++++++++------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 59a40f2be2d..457399117e0 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -94,7 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self.calculate_qparams(
+ q_weight, scale, zero_point = self._calculate_qparams(
original_weight
)
@@ -156,18 +156,17 @@ def _create_decompressor(
q_weight: torch.Tensor,
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
- if zero_point is not None:
- return INT4AsymmetricWeightsDecompressor(
- scale,
- zero_point,
- q_weight.shape,
- original_weight.shape,
- original_weight.dtype,
- )
- else:
+ if zero_point is None:
return INT4SymmetricWeightsDecompressor(
scale, q_weight.shape, original_weight.shape, original_weight.dtype
)
+ return INT4AsymmetricWeightsDecompressor(
+ scale,
+ zero_point,
+ q_weight.shape,
+ original_weight.shape,
+ original_weight.dtype,
+ )
class INT8WeightObserver(WeightObserverBase):
@@ -182,10 +181,11 @@ def _create_decompressor(
q_weight: torch.Tensor,
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
- if zero_point is not None:
- return INT8AsymmetricWeightsDecompressor(
- scale, zero_point, original_weight.dtype
+ if zero_point is None:
+ return INT8SymmetricWeightsDecompressor(
+ scale, original_weight.dtype
)
- else:
- return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+ return INT8AsymmetricWeightsDecompressor(
+ scale, zero_point, original_weight.dtype
+ )
From 204043f973ba928c3f2b73dc11e1db6572b7c4a7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:33:16 +0400
Subject: [PATCH 051/266] review changes in quantizer
---
backends/openvino/quantizer/quantizer.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 762ed2a9171..7e0e3c92af0 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -175,7 +175,6 @@ def _annotate_weight_compression(
:param graph: The underlying FX graph.
:param nncf_graph: The corresponding NNCF graph.
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
@@ -343,7 +342,7 @@ def _get_edge_or_node_and_annotation(
def _get_weight_edge(
target_node: torch.fx.Node,
nncf_graph: NNCFGraph,
- ):
+ ) -> tuple[torch.fx.Node, torch.fx.Node]:
"""
Returns the FX node corresponding to the weight tensor input of a given operator node.
Uses the NNCF graph to identify which input port of the target node holds the weight.
@@ -351,7 +350,6 @@ def _get_weight_edge(
:param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
:param nncf_graph: NNCFGraph used to determine weight port indices.
-
:return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
"""
nncf_node = nncf_graph.get_node_by_name(target_node.name)
@@ -428,7 +426,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
qmode = wc_param.compression_config.mode
is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- extra_args["wc_param"] = wc_param
observer = INT4WeightObserver
quant_min = -8 if not is_asym_mode else 0
quant_max = 7 if not is_asym_mode else 15
@@ -440,7 +437,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
else torch.per_channel_affine
)
else:
- extra_args["wc_param"] = wc_param
observer = INT8WeightObserver
quant_min = -128 if not is_asym_mode else 0
quant_max = 127 if not is_asym_mode else 255
@@ -453,7 +449,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
)
return QuantizationSpec(
dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+ observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
quant_min=quant_min,
quant_max=quant_max,
qscheme=torch_qscheme,
From ae6b089f293d20248df4c3d8a0d0c5ddfed62c4c Mon Sep 17 00:00:00 2001
From: anzr299
Date: Mon, 8 Sep 2025 18:45:54 +0400
Subject: [PATCH 052/266] revert extra args changes
---
backends/openvino/quantizer/quantizer.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7e0e3c92af0..89d528f8d16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -424,6 +424,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
extra_args: Dict[str, Any] = {}
qmode = wc_param.compression_config.mode
+ extra_args["wc_param"] = wc_param
is_asym_mode = wc_param.compression_config.is_asym_mode
if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
observer = INT4WeightObserver
@@ -449,7 +450,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
)
return QuantizationSpec(
dtype=dtype,
- observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
+ observer_or_fake_quant_ctr=observer.with_args(**extra_args),
quant_min=quant_min,
quant_max=quant_max,
qscheme=torch_qscheme,
From 2de569398917362b9ffc02849037528c2a15efa7 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Tue, 9 Sep 2025 11:43:00 +0400
Subject: [PATCH 053/266] precommit fixes
---
backends/openvino/quantizer/observers.py | 11 +++------
backends/openvino/quantizer/quantizer.py | 30 +++++++++++++----------
examples/models/llama/export_llama_lib.py | 6 +++--
extension/llm/export/quantizer_lib.py | 21 +++++++++-------
4 files changed, 36 insertions(+), 32 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 457399117e0..faeb4fa7a60 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -84,7 +84,7 @@ def convert(
self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
) -> None:
"""
- Replaces the given observer node from the given model with a quantized
+ Replaces the given observer node from the given model with a quantized
weight and a OpenVINO specific decompression module.
:param model: A `torch.fx.GraphModule` representing the statically traced model
@@ -94,9 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self._calculate_qparams(
- original_weight
- )
+ q_weight, scale, zero_point = self._calculate_qparams(original_weight)
decompressor = self._create_decompressor(
scale, zero_point, q_weight, original_weight
@@ -182,10 +180,7 @@ def _create_decompressor(
original_weight: torch.Tensor,
) -> BaseWeightsDecompressor:
if zero_point is None:
- return INT8SymmetricWeightsDecompressor(
- scale, original_weight.dtype
- )
+ return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
return INT8AsymmetricWeightsDecompressor(
scale, zero_point, original_weight.dtype
)
-
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 9db79fce9f9..bef1ef3274f 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,7 +12,6 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
-from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]
import torch.fx
@@ -21,12 +20,12 @@
INT8WeightObserver,
)
from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped]
-from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
- get_weight_compression_configuration,
-)
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
WeightCompressionParameters,
)
+from nncf.quantization.quantize_model import ( # type: ignore[import-untyped]
+ get_weight_compression_configuration,
+)
from torchao.quantization.pt2e import (
HistogramObserver,
PerChannelMinMaxObserver,
@@ -118,7 +117,7 @@ def __init__(
), # Mode value has to match NNCF CompressWeightsMode
**kwargs,
)
- subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+ subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
)
@@ -178,7 +177,9 @@ def _annotate_weight_compression(
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
- all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
+ all_wc_params, _ = self._algo.get_weight_compression_parameters(
+ model, nncf_graph
+ )
for wc_param in all_wc_params:
node_with_weight = wc_param.node_with_weight
@@ -187,9 +188,7 @@ def _annotate_weight_compression(
)
annotation = node_vs_torch_annotation[target_node]
edge_or_node = self._get_weight_edge(target_node, nncf_graph)
- qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
- wc_param=wc_param
- )
+ qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
return node_vs_torch_annotation
@@ -216,7 +215,9 @@ def _annotate_post_training_quantization(
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
- qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+ qspec: QuantizationSpecBase = (
+ self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+ )
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -426,8 +427,11 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
qmode = wc_param.compression_config.mode
extra_args["wc_param"] = wc_param
is_asym_mode = wc_param.compression_config.is_asym_mode
- if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
- observer = INT4WeightObserver
+ if qmode in [
+ nncf.CompressWeightsMode.INT4_ASYM,
+ nncf.CompressWeightsMode.INT4_SYM,
+ ]:
+ observer = INT4WeightObserver # type: ignore[type-abstract]
quant_min = -8 if not is_asym_mode else 0
quant_max = 7 if not is_asym_mode else 15
dtype = torch.int8
@@ -438,7 +442,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
else torch.per_channel_affine
)
else:
- observer = INT8WeightObserver
+ observer = INT8WeightObserver # type: ignore[type-abstract]
quant_min = -128 if not is_asym_mode else 0
quant_max = 127 if not is_asym_mode else 255
dtype = torch.int8
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 578fd0fea7b..d9c282888cc 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -43,10 +43,10 @@
)
from executorch.extension.llm.export.quantizer_lib import (
get_coreml_quantizer,
+ get_ov_quantizer,
get_pt2e_quantization_params,
get_pt2e_quantizers,
get_qnn_quantizer,
- get_ov_quantizer,
get_vulkan_quantizer,
)
from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -897,7 +897,9 @@ def _to_edge_and_lower_llama_openvino(
for partitioner in partitioners:
logging.info(f"--> {partitioner.__class__.__name__}")
- builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
+ builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+ partitioners
+ )
if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 83d4a84420d..df8c2a5e36c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -220,20 +220,22 @@ def get_ov_quantizer(
group_size: int = 32,
):
try:
- from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
- import nncf
- except ImportError:
- raise ImportError(
- "Please install nncf via backends/openvino/requirements.txt"
+ from executorch.backends.openvino.quantizer import (
+ OpenVINOQuantizer,
+ QuantizationMode,
)
-
+ except ImportError:
+ raise ImportError("Please install nncf via backends/openvino/requirements.txt")
+
backend, quant_config = pt2e_quantize.split("_")
assert (
backend == "openvino"
), f"The quantization config is for backend {backend} instead of openvino."
- assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+ assert (
+ group_size
+ ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
+ # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
fp_node_names = [
"linear_13",
"linear_14",
@@ -258,7 +260,8 @@ def get_ov_quantizer(
"linear_106",
"linear_109",
"linear_110",
- "linear_111",]
+ "linear_111",
+ ]
if quant_config == "4wo":
mode = QuantizationMode.INT4WO_ASYM
From 0e10f28242129a3c332ccdbd7a3b9a4340a8e1a1 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Tue, 9 Sep 2025 21:52:23 +0400
Subject: [PATCH 054/266] revert _calculate_qparams back to calculate_qparams
---
backends/openvino/quantizer/observers.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index faeb4fa7a60..6cda4561604 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -58,7 +58,7 @@ def __init__(
super().__init__(dtype=dtype, is_dynamic=False)
self._wc_param = wc_param
- def _calculate_qparams( # type: ignore[override]
+ def calculate_qparams( # type: ignore[override]
self,
weight: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -94,7 +94,7 @@ def convert(
"""
weight_node = observer_node.args[0]
original_weight = get_tensor_constant_from_node(weight_node, model)
- q_weight, scale, zero_point = self._calculate_qparams(original_weight)
+ q_weight, scale, zero_point = self.calculate_qparams(original_weight)
decompressor = self._create_decompressor(
scale, zero_point, q_weight, original_weight
From 05f5a929c7c5b9a79859d9c9848ce37dd0c16b41 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 10 Sep 2025 18:49:08 +0400
Subject: [PATCH 055/266] remove manual ignored nodes
---
extension/llm/export/quantizer_lib.py | 29 ---------------------------
1 file changed, 29 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index df8c2a5e36c..870080a7549 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,34 +235,6 @@ def get_ov_quantizer(
group_size
), "Group Size None is Not Supported. It should be set to -1 for per-channel."
- # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
- fp_node_names = [
- "linear_13",
- "linear_14",
- "linear_35",
- "linear_56",
- "linear_70",
- "linear_71",
- "linear_77",
- "linear_78",
- "linear_84",
- "linear_85",
- "linear_88",
- "linear_91",
- "linear_92",
- "linear_95",
- "linear_96",
- "linear_98",
- "linear_99",
- "linear_102",
- "linear_103",
- "linear_105",
- "linear_106",
- "linear_109",
- "linear_110",
- "linear_111",
- ]
-
if quant_config == "4wo":
mode = QuantizationMode.INT4WO_ASYM
@@ -274,7 +246,6 @@ def get_ov_quantizer(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
- ov_quantizer.set_ignored_scope(names=fp_node_names)
return ov_quantizer
From fbe0e21137ee9ebc8ea246e61fd9cfa252f57b15 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Wed, 10 Sep 2025 18:52:42 +0400
Subject: [PATCH 056/266] add ratio to quantizer initialization
---
extension/llm/export/quantizer_lib.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 870080a7549..350e8b3ce7c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,17 +235,23 @@ def get_ov_quantizer(
group_size
), "Group Size None is Not Supported. It should be set to -1 for per-channel."
+ quantization_params = {}
+
if quant_config == "4wo":
- mode = QuantizationMode.INT4WO_ASYM
+ quantization_params["mode"] = QuantizationMode.INT4WO_ASYM
+ quantization_params["group_size"] = group_size
+ quantization_params["ratio"] = 0.8
elif quant_config == "8wo":
- group_size = -1
- mode = QuantizationMode.INT8WO_SYM
+ quantization_params["mode"] = QuantizationMode.INT8WO_ASYM
+ quantization_params["group_size"] = -1
+ quantization_params["ratio"] = None
+
else:
raise AssertionError(
f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
)
- ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+ ov_quantizer = OpenVINOQuantizer(**quantization_params)
return ov_quantizer
From 6bff1cdb00ebdae53b57ab706cab6e9e9ee7e335 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Thu, 11 Sep 2025 23:04:13 +0400
Subject: [PATCH 057/266] Update export_llama_lib.py
---
examples/models/llama/export_llama_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index d9c282888cc..cbbf169a085 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -768,7 +768,7 @@ def get_quantizer_and_quant_params(llm_config):
if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
assert not quantizers, "Should not enable both xnnpack and openvino"
group_size = llm_config.quantization.group_size
- group_size = group_size if group_size else 32
+ group_size = group_size if group_size else 128
ov_quantizer = get_ov_quantizer(
llm_config.quantization.pt2e_quantize.value, group_size
)
From d744ae95f3cf806278b12db346105e233a2daec5 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Thu, 11 Sep 2025 23:04:50 +0400
Subject: [PATCH 058/266] Update quantizer_lib.py
---
extension/llm/export/quantizer_lib.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 350e8b3ce7c..f92c59cebd3 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -217,7 +217,7 @@ def get_qnn_quantizer(
def get_ov_quantizer(
pt2e_quantize: str,
- group_size: int = 32,
+ group_size: int = 128,
):
try:
from executorch.backends.openvino.quantizer import (
From b874204d7d8eba9aa35dc8f9e55bd47bc0719cbb Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Thu, 11 Sep 2025 14:22:29 -0700
Subject: [PATCH 059/266] Updated NNCF commit id
---
backends/openvino/requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 2ada445414c..519818d0aac 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
transformers
-git+https://github.com/openvinotoolkit/nncf@5cb2b58#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf
From 41ac36a8a513e2adbc3015d231f071b7530efae0 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 11 Sep 2025 16:21:43 -0700
Subject: [PATCH 060/266] openvino llama export configuration - initial
---
examples/openvino/llama/README.md | 11 ++++++++++
.../llama/llama3_2_ov_4wo_config.yaml | 20 +++++++++++++++++++
2 files changed, 31 insertions(+)
create mode 100644 examples/openvino/llama/README.md
create mode 100644 examples/openvino/llama/llama3_2_ov_4wo_config.yaml
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
new file mode 100644
index 00000000000..30644af3cde
--- /dev/null
+++ b/examples/openvino/llama/README.md
@@ -0,0 +1,11 @@
+
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m extension.llm.export.export_llm \
+ --config llama3_2_ov_4wo_config.yaml \
+ +base.model_class="llama3_2" \
+ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ +base.params="${LLAMA_PARAMS:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \
diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
new file mode 100644
index 00000000000..7f47f133216
--- /dev/null
+++ b/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
@@ -0,0 +1,20 @@
+base:
+ metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+ use_kv_cache: True
+ dtype_override: fp32
+ enable_dynamic_shape: False
+
+export:
+ output_dir: "../"
+
+quantization:
+ pt2e_quantize: "openvino_4wo"
+
+backend:
+ openvino:
+ enabled: True
+
+debug:
+ verbose: True
From 08461ec1b54de22b279511669a862d20ecef0f5d Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Thu, 11 Sep 2025 16:32:20 -0700
Subject: [PATCH 061/266] updated ov llama config file
---
.../{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
rename examples/openvino/llama/{llama3_2_ov_4wo_config.yaml => llama3_2_ov_4wo.yaml} (90%)
diff --git a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml
similarity index 90%
rename from examples/openvino/llama/llama3_2_ov_4wo_config.yaml
rename to examples/openvino/llama/llama3_2_ov_4wo.yaml
index 7f47f133216..68a53708fb9 100644
--- a/examples/openvino/llama/llama3_2_ov_4wo_config.yaml
+++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml
@@ -6,11 +6,9 @@ model:
dtype_override: fp32
enable_dynamic_shape: False
-export:
- output_dir: "../"
-
quantization:
pt2e_quantize: "openvino_4wo"
+ group_size: 128
backend:
openvino:
From be85af8b86b995b4879e4382cc00f00eb7584d16 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:14:11 -0700
Subject: [PATCH 062/266] Update README.md
---
examples/openvino/llama/README.md | 40 +++++++++++++++++++++++++++----
1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 30644af3cde..e5571e3da79 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -1,11 +1,41 @@
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
+# Export Llama with OpenVINO Backend
-python -m extension.llm.export.export_llm \
+## Download the Model
+Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+
+## Environment Setup
+Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+
+## Export the model:
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+
+```
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m executorch.extension.llm.export.export_llm \
--config llama3_2_ov_4wo_config.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
- +base.tokenizer_path="${LLAMA_TOKENIZER:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
+```
+
+## Build OpenVINO C++ Runtime with Llama Runner:
+First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --cpp_runtime
+```
+Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --llama_runner
+```
+The executable is saved in `/cmake-out/examples/models/llama/llama_main`
+
+## Execute Inference Using Llama Runner
+Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
+```
+./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
+```
From 35f1d84b05b285f1cf041ac6e4c95b840e9631ca Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:20:28 -0700
Subject: [PATCH 063/266] Update README.md
---
examples/openvino/README.md | 53 +++----------------------------------
1 file changed, 4 insertions(+), 49 deletions(-)
diff --git a/examples/openvino/README.md b/examples/openvino/README.md
index dbce5df1b55..0ecedde092c 100644
--- a/examples/openvino/README.md
+++ b/examples/openvino/README.md
@@ -9,7 +9,10 @@ Below is the layout of the `examples/openvino` directory, which includes the nec
```
examples/openvino
├── README.md # Documentation for examples (this file)
-└── aot_optimize_and_infer.py # Example script to export and execute models
+├── aot_optimize_and_infer.py # Example script to export and execute models
+└── llama
+ ├── README.md # Documentation for Llama example
+ └── llama3_2_ov_4wo.yaml # Configuration file for exporting Llama3.2 with OpenVINO backend
```
# Build Instructions for Examples
@@ -183,51 +186,3 @@ Run inference with a given model for 10 iterations:
--model_path=model.pte \
--num_executions=10
```
-
-# Export Llama with OpenVINO Backend
-
-## Download the Model
-Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
-
-## Environment Setup
-Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
-
-## Export the model:
-Execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
-
-```
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
-
-python -u -m examples.models.llama.export_llama \
- --model "llama3_2" \
- --checkpoint "${LLAMA_CHECKPOINT:?}" \
- --params "${LLAMA_PARAMS:?}" \
- -kv \
- --openvino \
- -d fp32 \
- --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
- --output_name="llama.pte" \
- --verbose \
- --disable_dynamic_shape \
- --tokenizer_path "${LLAMA_TOKENIZER:?}" \
- --nncf_compression
-```
-
-## Build OpenVINO C++ Runtime with Llama Runner:
-First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
-```bash
-./openvino_build.sh
-```
-Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `/backends/openvino/scripts` folder:
-```bash
-./openvino_build.sh --llama_runner
-```
-The executable is saved in `/cmake-out/examples/models/llama/llama_main`
-
-## Execute Inference Using Llama Runner
-Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
-```
-./cmake-out/examples/models/llama/llama_main --model_path=llama.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
-```
From 4426541d133b8d9c3148c06654b870f27b4123d0 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:25:34 -0700
Subject: [PATCH 064/266] Update README.md
---
examples/openvino/llama/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index e5571e3da79..abb3f5179cb 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d
Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
@@ -37,5 +37,5 @@ The executable is saved in `/cmake-out/examples/models/llama/ll
## Execute Inference Using Llama Runner
Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
```
-./cmake-out/examples/models/llama/llama_main --model_path=llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
+./cmake-out/examples/models/llama/llama_main --model_path=/examples/openvino/llama/llama3_2.pte --tokenizer_path=/tokenizer.model --prompt="Your custom prompt"
```
From 6b936c5ddf8ab6c356315fd67f293a331f1a4aaf Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:26:51 -0700
Subject: [PATCH 065/266] Update README.md
---
examples/openvino/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index abb3f5179cb..4de20a0f061 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -16,7 +16,7 @@ LLAMA_PARAMS=/params.json
LLAMA_TOKENIZER=/tokenizer.model
python -m executorch.extension.llm.export.export_llm \
- --config llama3_2_ov_4wo_config.yaml \
+ --config llama3_2_ov_4wo.yaml \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
From bba4a01437ef5b1b6a6ddd7af5a406a9cc9842ca Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 11 Sep 2025 16:51:22 -0700
Subject: [PATCH 066/266] Update README.md
---
examples/openvino/llama/README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 4de20a0f061..d357f038781 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -8,7 +8,7 @@ Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to d
Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. The exported model will be generated in the same directory with the filename `llama3_2.pte`.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
@@ -17,6 +17,7 @@ LLAMA_TOKENIZER=/tokenizer.model
python -m executorch.extension.llm.export.export_llm \
--config llama3_2_ov_4wo.yaml \
+ +backend.openvino.device="CPU" \
+base.model_class="llama3_2" \
+base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+base.params="${LLAMA_PARAMS:?}" \
From 1421921da0a6b083c17c9fe85b5b5f8beebd7216 Mon Sep 17 00:00:00 2001
From: Aamir Nazir
Date: Fri, 12 Sep 2025 13:05:24 +0400
Subject: [PATCH 067/266] Update README.md with quantization paragraph
---
examples/openvino/llama/README.md | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index d357f038781..7a97e27410c 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -24,6 +24,24 @@ python -m executorch.extension.llm.export.export_llm \
+base.tokenizer_path="${LLAMA_TOKENIZER:?}"
```
+### Compress Model Weights and Export
+OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU.
+
+```
+LLAMA_CHECKPOINT=/consolidated.00.pth
+LLAMA_PARAMS=/params.json
+LLAMA_TOKENIZER=/tokenizer.model
+
+python -m executorch.extension.llm.export.export_llm \
+ --config llama3_2_ov_4wo.yaml \
+ +backend.openvino.device="CPU" \
+ +base.model_class="llama3_2" \
+ +pt2e_quantize opevnino_4wo \
+ +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+ +base.params="${LLAMA_PARAMS:?}" \
+ +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
+```
+
## Build OpenVINO C++ Runtime with Llama Runner:
First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
```bash
From f050eeac96dd63c158afb526c1df1ac13beec0f6 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Sun, 14 Sep 2025 20:39:13 -0700
Subject: [PATCH 068/266] formatting fix
---
backends/openvino/quantizer/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index 0fd8c10b249..5aae52ef3e8 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
+from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model
__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
From 4bfdca9e95de0bbe41e3f0e8df8e4f1e8476d97f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Sun, 14 Sep 2025 20:44:22 -0700
Subject: [PATCH 069/266] Update README.md
---
examples/openvino/llama/README.md | 17 +----------------
1 file changed, 1 insertion(+), 16 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 7a97e27410c..46dbfb8c2f0 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -25,22 +25,7 @@ python -m executorch.extension.llm.export.export_llm \
```
### Compress Model Weights and Export
-OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved using the `--pt2e_quantize opevnino_4wo` flag. For modifying the group size `--group_size` can be used. By default group size 128 is used to achieve optimal performance with the NPU.
-
-```
-LLAMA_CHECKPOINT=/consolidated.00.pth
-LLAMA_PARAMS=/params.json
-LLAMA_TOKENIZER=/tokenizer.model
-
-python -m executorch.extension.llm.export.export_llm \
- --config llama3_2_ov_4wo.yaml \
- +backend.openvino.device="CPU" \
- +base.model_class="llama3_2" \
- +pt2e_quantize opevnino_4wo \
- +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
- +base.params="${LLAMA_PARAMS:?}" \
- +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
-```
+OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved by setting `pt2e_quantize` option in `llama3_2_ov_4wo.yaml` file under `quantization`. Set this parameter to `openvino_4wo` for INT4 or `openvino_8wo` for INT8 weight compression. It is set to `openvino_4wo` in `llama3_2_ov_4wo.yaml` file by default. For modifying the group size, set `group_size` option in `llama3_2_ov_4wo.yaml` file under `quantization`. By default group size 128 is used to achieve optimal performance with the NPU.
## Build OpenVINO C++ Runtime with Llama Runner:
First, build the backend libraries by executing the script below in `/backends/openvino/scripts` folder:
From 16aba1bb1bb52632829e5a84ef0dd15f0e01d464 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:21:59 -0700
Subject: [PATCH 070/266] Update non_cpu_backends.md for OpenVINO instructions
---
examples/models/llama/non_cpu_backends.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md
index f414582a3c1..6e5d0b63256 100644
--- a/examples/models/llama/non_cpu_backends.md
+++ b/examples/models/llama/non_cpu_backends.md
@@ -22,3 +22,6 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu
### MTK
Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip
+
+### OpenVINO
+Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs.
From 155529f2a63bffeaa6539908dabda16e8d0e415f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:22:58 -0700
Subject: [PATCH 071/266] Update llama instructions link for OpenVINO backend
---
examples/models/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 784142b61f1..aba3b255fee 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -136,7 +136,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md).
# Instructions
From 5875aa8af0b07474b6d7d066164dc5a298b26d9a Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 10:25:46 -0700
Subject: [PATCH 072/266] Remove OpenVINO from non_cpu_backends.md
---
examples/models/llama/non_cpu_backends.md | 3 ---
1 file changed, 3 deletions(-)
diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md
index 6e5d0b63256..f414582a3c1 100644
--- a/examples/models/llama/non_cpu_backends.md
+++ b/examples/models/llama/non_cpu_backends.md
@@ -22,6 +22,3 @@ After exporting the CoreML model .pte file, please [follow the instruction to bu
### MTK
Please [follow the instructions](https://github.com/pytorch/executorch/tree/main/examples/mediatek#llama-example-instructions) to deploy llama3 8b to an Android phones with MediaTek chip
-
-### OpenVINO
-Please follow [the instructions](../../openvino/llama/README.md) to deploy Llama 3 1B to Intel CPUs, GPUs, and NPUs.
From 2630fd6c1db8f3e8eb5a840b34b96b48210c9362 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 11:03:51 -0700
Subject: [PATCH 073/266] Update llama instructions for OpenVINO backend
---
examples/models/llama/README.md | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index aba3b255fee..516f0073ef1 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -94,6 +94,8 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p
+[Please visit this section to try it on OpenVINO backend](../../openvino/llama/README.md).
+
## Llama 3/3.1 8B
Since Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized (PTQ) model.
@@ -136,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, MediaTek, or OpenVINO](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md).
# Instructions
From 6d0cbc53a5143c0bf66333872fdecefbc66b60d0 Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Tue, 16 Sep 2025 11:11:17 -0700
Subject: [PATCH 074/266] Removed the comma which was added by mistake
---
examples/models/llama/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 516f0073ef1..d0e72234c54 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -138,7 +138,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus
-[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP, or MediaTek](non_cpu_backends.md).
+[Please visit this section to try it on non-CPU backend, including CoreML, MPS, Qualcomm HTP or MediaTek](non_cpu_backends.md).
# Instructions
From 3fbefecb61e147114c2aabc02079e88fa6d7777f Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 16 Sep 2025 12:18:52 -0700
Subject: [PATCH 075/266] Added NPU in choices
---
examples/models/llama/export_llama_lib.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ed352c0997e..4f4ef2553aa 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -464,8 +464,8 @@ def build_args_parser() -> argparse.ArgumentParser:
"--openvino_device",
type=str,
default="CPU",
- choices=["CPU", "GPU"],
- help="Specify the device for Openvino (CPU or GPU).",
+ choices=["CPU", "GPU", "NPU"],
+ help="Specify the device for Openvino (CPU, GPU or NPU).",
)
parser.add_argument(
From 12e51c72d6f184c1ee6902d6d8f895292a4d6d92 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 16 Sep 2025 15:26:06 -0700
Subject: [PATCH 076/266] Fixed ref links
---
examples/openvino/llama/README.md | 6 +++---
examples/openvino/llama/llama3_2_ov_4wo.yaml | 11 +++++++----
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
index 46dbfb8c2f0..a98645b3918 100644
--- a/examples/openvino/llama/README.md
+++ b/examples/openvino/llama/README.md
@@ -2,13 +2,13 @@
# Export Llama with OpenVINO Backend
## Download the Model
-Follow the [instructions](../../examples/models/llama#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+Follow the [instructions](../../../examples/models/llama/README.md#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
## Environment Setup
-Follow the [instructions](../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+Follow the [instructions](../../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
## Export the model:
-Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2.pte`.
+Navigate into `/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2_ov.pte`. For modifying the output name, change `output_name` in `llama3_2_ov_4wo.yaml` file under `export`.
```
LLAMA_CHECKPOINT=/consolidated.00.pth
diff --git a/examples/openvino/llama/llama3_2_ov_4wo.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml
index 68a53708fb9..8fb1d7a1c09 100644
--- a/examples/openvino/llama/llama3_2_ov_4wo.yaml
+++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml
@@ -2,17 +2,20 @@ base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
model:
- use_kv_cache: True
+ use_kv_cache: true
dtype_override: fp32
- enable_dynamic_shape: False
+ enable_dynamic_shape: false
quantization:
pt2e_quantize: "openvino_4wo"
group_size: 128
+export:
+ output_name: "llama3_2_ov.pte"
+
backend:
openvino:
- enabled: True
+ enabled: true
debug:
- verbose: True
+ verbose: false
From 72331f5d0feaea93cef7517fda0eba7942ac6dd2 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 17 Sep 2025 13:16:49 -0700
Subject: [PATCH 077/266] Added Remove clone ops transformation to OpenVINO
backend
---
backends/openvino/preprocess.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..66d5ec97b0a 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,6 +8,7 @@
from typing import final, List
+from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
@@ -36,6 +37,14 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
+ # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
+ remove_clone_transform = RemoveCloneOpsTransform()
+ transformed_result = remove_clone_transform(edge_program.graph_module)
+
+ # Update the edge_program with the transformed graph
+ if transformed_result.graph_module is not None:
+ edge_program._graph_module = transformed_result.graph_module
+
input_names = edge_program.graph_signature.user_inputs
args = []
for node in edge_program.graph.nodes:
@@ -47,7 +56,9 @@ def preprocess(
compile_options[spec.key] = spec.value.decode()
compiled = openvino_compile(
- edge_program.module(), *args, options=compile_options
+ edge_program.module(),
+ *args,
+ options=compile_options
)
model_bytes = compiled.export_model()
From 8016165619eee3777e2ef437e4b83de84b3582b6 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 17 Sep 2025 13:28:50 -0700
Subject: [PATCH 078/266] Fixed variable names
---
backends/openvino/preprocess.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 66d5ec97b0a..7fc9d61d68e 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -38,12 +38,11 @@ def preprocess(
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
# Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
- remove_clone_transform = RemoveCloneOpsTransform()
- transformed_result = remove_clone_transform(edge_program.graph_module)
+ transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
# Update the edge_program with the transformed graph
- if transformed_result.graph_module is not None:
- edge_program._graph_module = transformed_result.graph_module
+ if transformed_ep.graph_module is not None:
+ edge_program._graph_module = transformed_ep.graph_module
input_names = edge_program.graph_signature.user_inputs
args = []
From f0d9fc72f504cb7e80ee34c02bca2e62977a1c9e Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 15:30:48 -0700
Subject: [PATCH 079/266] Added extended support list for openvino backend
---
backends/openvino/partitioner.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 20841d6730b..00107959412 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -34,6 +34,9 @@ def __init__(self):
class OpenvinoOperatorsSupport(OperatorSupportBase):
+ extended_support_dict = {
+ "torch.ops.dim_order_ops._clone_dim_order.default": None,
+ }
def __init__(
self,
@@ -77,7 +80,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
if node.name in self._enabled_ops_by_name:
return True
- supported_ops = OperatorSupport(options)._support_dict
+ supported_ops = (
+ OperatorSupport(options)._support_dict | self.extended_support_dict
+ )
if op_type == "getitem":
return True
From 9b41c28be3e266c10808ae07cc1cf1ff84112280 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 15:31:06 -0700
Subject: [PATCH 080/266] formatting fix
---
backends/openvino/preprocess.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 7fc9d61d68e..3ba693973e0 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -55,9 +55,7 @@ def preprocess(
compile_options[spec.key] = spec.value.decode()
compiled = openvino_compile(
- edge_program.module(),
- *args,
- options=compile_options
+ edge_program.module(), *args, options=compile_options
)
model_bytes = compiled.export_model()
From e7517263cdae812bf96941c6ececd73790f1c69a Mon Sep 17 00:00:00 2001
From: Cavus Mustafa
Date: Wed, 17 Sep 2025 16:09:00 -0700
Subject: [PATCH 081/266] formatting fix
---
backends/openvino/preprocess.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 3ba693973e0..72c781c0fb3 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -41,7 +41,7 @@ def preprocess(
transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
# Update the edge_program with the transformed graph
- if transformed_ep.graph_module is not None:
+ if transformed_ep and transformed_ep.graph_module:
edge_program._graph_module = transformed_ep.graph_module
input_names = edge_program.graph_signature.user_inputs
From 8106204b8a4af557bc6d925b070d9202789c14b4 Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Tue, 30 Sep 2025 15:32:58 -0700
Subject: [PATCH 082/266] Added DimorderOpsRevertPass to Openvino backend
---
backends/openvino/partitioner.py | 1 +
backends/openvino/preprocess.py | 5 ++---
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 00107959412..0d407e33f6e 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -36,6 +36,7 @@ def __init__(self):
class OpenvinoOperatorsSupport(OperatorSupportBase):
extended_support_dict = {
"torch.ops.dim_order_ops._clone_dim_order.default": None,
+ "torch.ops.dim_order_ops._to_dim_order_copy.default": None,
}
def __init__(
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 72c781c0fb3..7d89e117dc6 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,7 +8,7 @@
from typing import final, List
-from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
@@ -37,8 +37,7 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
- # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations
- transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module)
+ transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module)
# Update the edge_program with the transformed graph
if transformed_ep and transformed_ep.graph_module:
From d95143ebe0fee4bfe127ff6d99e7fe3bd1693728 Mon Sep 17 00:00:00 2001
From: Onuralp SEZER
Date: Wed, 1 Oct 2025 21:15:57 +0300
Subject: [PATCH 083/266] refactor:(samsung backend): replace pkg_resources
with importlib.resources for schema loading (#14654)
This PR refactors the Samsung backend schema loading logic in
compile_options.py by replacing pkg_resources with importlib.resources.
This modernizes resource access, improves compatibility with Python
packaging standards, and removes the dependency on setuptools. No
functional changes to the compile options logic; only the resource
loading mechanism is updated.
Signed-off-by: Onuralp SEZER
---
.../samsung/serialization/compile_options.py | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/backends/samsung/serialization/compile_options.py b/backends/samsung/serialization/compile_options.py
index 1ad2350cfeb..a4af40368e9 100644
--- a/backends/samsung/serialization/compile_options.py
+++ b/backends/samsung/serialization/compile_options.py
@@ -11,7 +11,8 @@
from dataclasses import dataclass
from enum import IntEnum, unique
-import pkg_resources
+from importlib.resources import files
+
from executorch.exir._serialize._dataclass import _DataclassEncoder
from executorch.exir._serialize._flatbuffer import _flatc_compile
from executorch.exir.backend.backend_details import CompileSpec
@@ -36,12 +37,15 @@ def gen_samsung_backend_compile_spec_core(options: EnnExecuTorchOptions) -> Comp
with tempfile.TemporaryDirectory() as d:
# schema
schema_path = os.path.join(d, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME))
+
+ schema_content = (
+ files(__package__)
+ .joinpath(f"{COMPILE_OPTION_SCHEMA_NAME}.fbs")
+ .read_bytes()
+ )
+
with open(schema_path, "wb") as schema_file:
- schema_file.write(
- pkg_resources.resource_string(
- __name__, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME)
- )
- )
+ schema_file.write(schema_content)
# dump json
json_path = os.path.join(d, "{}.json".format(COMPILE_OPTION_SCHEMA_NAME))
enn_options_json = json.dumps(options, cls=_DataclassEncoder, indent=4)
From eaf0e174f09e9cfa1584d8e77b8f06abf18b8e1b Mon Sep 17 00:00:00 2001
From: suryasidd
Date: Wed, 1 Oct 2025 11:28:42 -0700
Subject: [PATCH 084/266] Fixed linter issues
---
backends/openvino/preprocess.py | 3 ++-
extension/llm/export/config/llm_config.py | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 7d89e117dc6..691115f6579 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -8,13 +8,14 @@
from typing import final, List
-from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from executorch.exir.backend.backend_details import (
BackendDetails,
ExportedProgram,
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped]
openvino_compile,
)
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index a176fa71dcc..0ac965b98cc 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -465,6 +465,7 @@ class OpenvinoConfig:
nncf_compression: bool = False
nncf_compression_group_size: int = 32
+
@dataclass
class TorchAOKernelsConfig:
"""
From 19be2a3ccbfb26f20cce1cc83a1f07e6e8c909be Mon Sep 17 00:00:00 2001
From: cccclai
Date: Wed, 1 Oct 2025 12:43:30 -0700
Subject: [PATCH 085/266] Try to get nightly wheel build work with qnn (#14633)
Our current nightly/release wheel package is done following
https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows
As described by
https://github.com/pytorch/test-infra/blob/5398e1a00c39939f43251f29031c37e6d0c84647/.github/workflows/build_wheels_linux.yml#L4,
The docker image infra team used to release nightly/release package is
from https://github.com/pypa/manylinux, and it's currently using
https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based.
It means the glibc version is 2.28 and GCC is 14.
The issue is that, QNN .so files are not compatible with 2.28. The
minimum version is 2.34 (I tried 2.29 the first time when it failed and
asked for 2.29, but it still fails).
In this PR, instead of checking glibc and failing directly when the minimum
version isn't met, we download glibc 2.34 to /tmp. A
different strategy compared with libstdc++ is that we don't load it,
because the Python process itself starts with the system glibc 2.28. We
need to re-execute the process with the new glibc
---
backends/qualcomm/__init__.py | 14 +-
backends/qualcomm/scripts/download_qnn_sdk.py | 280 ++++++++++++++----
setup.py | 3 +-
3 files changed, 218 insertions(+), 79 deletions(-)
diff --git a/backends/qualcomm/__init__.py b/backends/qualcomm/__init__.py
index 04ba5fcf24b..5770dfb0fcd 100644
--- a/backends/qualcomm/__init__.py
+++ b/backends/qualcomm/__init__.py
@@ -1,23 +1,13 @@
import os
-from .scripts.download_qnn_sdk import (
- check_glibc_exist_and_validate,
- install_qnn_sdk,
- is_linux_x86,
-)
+from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86
env_flag = os.getenv("EXECUTORCH_BUILDING_WHEEL", "0").lower()
# If users have preinstalled QNN_SDK_ROOT, we will use it.
qnn_sdk_root_flag = os.getenv("QNN_SDK_ROOT", None)
-if (
- env_flag not in ("1", "true", "yes")
- and not qnn_sdk_root_flag
- and is_linux_x86()
- and check_glibc_exist_and_validate()
-):
+if env_flag not in ("1", "true", "yes") and not qnn_sdk_root_flag and is_linux_x86():
ok = install_qnn_sdk()
-
if not ok:
raise RuntimeError("Failed to install QNN SDK. Please check the logs above.")
diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py
index 35006a41433..747524a0e5b 100644
--- a/backends/qualcomm/scripts/download_qnn_sdk.py
+++ b/backends/qualcomm/scripts/download_qnn_sdk.py
@@ -6,12 +6,15 @@
import platform
import re
import shutil
+import subprocess
+import sys
import tarfile
import tempfile
import urllib.request
import zipfile
from typing import Dict, List, Optional, Tuple
+
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
@@ -34,68 +37,81 @@ def is_linux_x86() -> bool:
)
-import subprocess
+#########################
+# Cache directory helper
+#########################
-MINIMUM_LIBC_VERSION = 2.29
+APP_NAMESPACE = ["executorch", "qnn"]
-REQUIRED_LIBC_LIBS = [
- "/lib/x86_64-linux-gnu/libc.so.6",
- "/lib64/libc.so.6",
- "/lib/libc.so.6",
-]
+def _get_staging_dir(*parts: str) -> pathlib.Path:
+ r"""
+ Return a cross-platform staging directory for staging SDKs/libraries.
+
+ - On Linux:
+ ~/.cache/executorch/qnn/
+ (falls back to $HOME/.cache if $XDG_CACHE_HOME is unset)
-def check_glibc_exist_and_validate() -> bool:
+ - On Windows (not supported yet, but as placeholder):
+ %LOCALAPPDATA%\executorch\qnn\
+ (falls back to $HOME/AppData/Local if %LOCALAPPDATA% is unset)
+
+ - Override:
+ If QNN_STAGING_DIR is set in the environment, that path is used instead.
+
+ Args:
+ parts (str): Subdirectories to append under the root staging dir.
+
+ Returns:
+ pathlib.Path: Fully qualified staging path.
"""
- Check if users have glibc installed.
+ # Environment override wins
+ base = os.environ.get("QNN_STAGING_DIR")
+ if base:
+ return pathlib.Path(base).joinpath(*parts)
+
+ system = platform.system().lower()
+ if system == "windows":
+ # On Windows, prefer %LOCALAPPDATA%, fallback to ~/AppData/Local
+ base = pathlib.Path(
+ os.environ.get("LOCALAPPDATA", pathlib.Path.home() / "AppData" / "Local")
+ )
+ elif is_linux_x86():
+ # On Linux/Unix, prefer $XDG_CACHE_HOME, fallback to ~/.cache
+ base = pathlib.Path(
+ os.environ.get("XDG_CACHE_HOME", pathlib.Path.home() / ".cache")
+ )
+ else:
+ raise ValueError(f"Unsupported platform: {system}")
+
+ return base.joinpath(*APP_NAMESPACE, *parts)
+
+
+def _atomic_download(url: str, dest: pathlib.Path):
"""
- exists = False
- for path in REQUIRED_LIBC_LIBS:
- try:
- output = subprocess.check_output(
- [path, "--version"], stderr=subprocess.STDOUT
- )
- output = output.decode().split("\n")[0]
- logger.debug(f"[QNN] glibc version for path {path} is: {output}")
- match = re.search(r"version (\d+\.\d+)", output)
- if match:
- version = match.group(1)
- if float(version) >= MINIMUM_LIBC_VERSION:
- logger.debug(f"[QNN] glibc version is {version}.")
- exists = True
- return True
- else:
- logger.error(
- f"[QNN] glibc version is too low. The minimum libc version is {MINIMUM_LIBC_VERSION} Please install glibc following the commands below."
- )
- else:
- logger.error("[QNN] glibc version not found.")
+ Download URL into dest atomically:
+ - Write to a temp file in the same dir
+ - Move into place if successful
+ """
+ dest.parent.mkdir(parents=True, exist_ok=True)
- except Exception:
- continue
+ # Temp file in same dir (guarantees atomic rename)
+ with tempfile.NamedTemporaryFile(dir=dest.parent, delete=False) as tmp:
+ tmp_path = pathlib.Path(tmp.name)
- if not exists:
- logger.error(
- r""""
- [QNN] glibc not found or the version is too low. Please install glibc following the commands below.
- Ubuntu/Debian:
- sudo apt update
- sudo apt install libc6
-
- Fedora/Red Hat:
- sudo dnf install glibc
-
- Arch Linux:
- sudo pacman -S glibc
-
- Also please make sure the glibc version is >= MINIMUM_LIBC_VERSION. You can verify the glibc version by running the following command:
- Option 1:
- ldd --version
- Option 2:
- /path/to/libc.so.6 --version
- """
- )
- return exists
+ try:
+ urllib.request.urlretrieve(url, tmp_path)
+ tmp_path.replace(dest) # atomic rename
+ except Exception:
+ # Clean up partial file on failure
+ if tmp_path.exists():
+ tmp_path.unlink(missing_ok=True)
+ raise
+
+
+####################
+# qnn sdk download management
+####################
def _download_archive(url: str, archive_path: pathlib.Path) -> bool:
@@ -178,9 +194,6 @@ def _download_qnn_sdk(dst_folder=SDK_DIR) -> Optional[pathlib.Path]:
if not is_linux_x86():
logger.info("[QNN] Skipping Qualcomm SDK (only supported on Linux x86).")
return None
- elif not check_glibc_exist_and_validate():
- logger.info("[QNN] Skipping Qualcomm SDK (glibc not found or version too old).")
- return None
else:
logger.info("[QNN] Downloading Qualcomm SDK for Linux x86")
@@ -241,6 +254,136 @@ def _extract_tar(archive_path: pathlib.Path, prefix: str, target_dir: pathlib.Pa
dst.write(src.read())
+####################
+# libc management
+####################
+
+GLIBC_VERSION = "2.34"
+GLIBC_REEXEC_GUARD = "QNN_GLIBC_REEXEC"
+MINIMUM_LIBC_VERSION = GLIBC_VERSION
+
+
+def _get_glibc_libdir() -> pathlib.Path:
+ glibc_root = _get_staging_dir(f"glibc-{GLIBC_VERSION}")
+ return glibc_root / "lib"
+
+
+def _parse_version(v: str) -> tuple[int, int]:
+ """Turn '2.34' → (2,34) so it can be compared."""
+ parts = v.split(".")
+ return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0
+
+
+def _current_glibc_version() -> str:
+ """Return system glibc version string (via ctypes)."""
+ try:
+ libc = ctypes.CDLL("libc.so.6")
+ func = libc.gnu_get_libc_version
+ func.restype = ctypes.c_char_p
+ return func().decode()
+ except Exception as e:
+ return f"error:{e}"
+
+
+def _resolve_glibc_loader() -> pathlib.Path | None:
+ """Return staged ld.so path if available."""
+ for p in [
+ _get_glibc_libdir() / f"ld-{GLIBC_VERSION}.so",
+ _get_glibc_libdir() / "ld-linux-x86-64.so.2",
+ ]:
+ if p.exists():
+ return p
+ return None
+
+
+def _stage_prebuilt_glibc():
+    """Download + extract Fedora 35 glibc RPM into the staging dir."""
+ logger.info(">>> Staging prebuilt glibc-%s from Fedora 35 RPM", GLIBC_VERSION)
+ _get_glibc_libdir().mkdir(parents=True, exist_ok=True)
+ rpm_path = _get_staging_dir("glibc") / "glibc.rpm"
+ work_dir = _get_staging_dir("glibc") / "extracted"
+ rpm_url = (
+ "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/"
+ "Everything/x86_64/os/Packages/g/glibc-2.34-7.fc35.x86_64.rpm"
+ )
+
+ rpm_path.parent.mkdir(parents=True, exist_ok=True)
+ logger.info("[glibc] Downloading %s -> %s", rpm_url, rpm_path)
+ try:
+ urllib.request.urlretrieve(rpm_url, rpm_path)
+ except Exception as e:
+ logger.error("[glibc] Failed to download %s: %s", rpm_url, e)
+ raise
+
+ # Extract
+ if work_dir.exists():
+ shutil.rmtree(work_dir)
+ work_dir.mkdir(parents=True)
+ subprocess.check_call(["bsdtar", "-C", str(work_dir), "-xf", str(rpm_path)])
+
+ # Copy runtime libs
+ staged = [
+ "ld-linux-x86-64.so.2",
+ "libc.so.6",
+ "libdl.so.2",
+ "libpthread.so.0",
+ "librt.so.1",
+ "libm.so.6",
+ "libutil.so.1",
+ ]
+ for lib in staged:
+ src = work_dir / "lib64" / lib
+ if src.exists():
+ shutil.copy2(src, _get_glibc_libdir() / lib)
+ logger.info("[glibc] Staged %s", lib)
+ else:
+ logger.warning("[glibc] Missing %s in RPM", lib)
+
+
+def ensure_glibc_minimum(min_version: str = GLIBC_VERSION):
+ """
+ Ensure process runs under glibc >= min_version.
+ - If system glibc is new enough → skip.
+ - Else → stage Fedora RPM and re-exec under staged loader.
+ """
+ current = _current_glibc_version()
+ logger.info("[glibc] Current loaded glibc: %s", current)
+
+ # If system glibc already sufficient → skip everything
+ m = re.match(r"(\d+\.\d+)", current)
+ if m and _parse_version(m.group(1)) >= _parse_version(min_version):
+ logger.info("[glibc] System glibc >= %s, no staging needed.", min_version)
+ return
+
+ # Avoid infinite loop
+ if os.environ.get(GLIBC_REEXEC_GUARD) == "1":
+ logger.info("[glibc] Already re-exec'd once, continuing.")
+ return
+
+ # Stage prebuilt if not already staged
+ if not (_get_glibc_libdir() / "libc.so.6").exists():
+ _stage_prebuilt_glibc()
+
+ loader = _resolve_glibc_loader()
+ if not loader:
+ logger.error("[glibc] Loader not found in %s", _get_glibc_libdir())
+ return
+
+ logger.info(
+ "[glibc] Re-execing under loader %s with libdir %s", loader, _get_glibc_libdir()
+ )
+ os.environ[GLIBC_REEXEC_GUARD] = "1"
+ os.execv(
+ str(loader),
+ [str(loader), "--library-path", str(_get_glibc_libdir()), sys.executable]
+ + sys.argv,
+ )
+
+
+####################
+# libc++ management
+####################
+
LLVM_VERSION = "14.0.0"
LIBCXX_BASE_NAME = f"clang+llvm-{LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04"
LLVM_URL = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/{LIBCXX_BASE_NAME}.tar.xz"
@@ -258,12 +401,17 @@ def _stage_libcxx(target_dir: pathlib.Path):
logger.info("[libcxx] Already staged at %s, skipping download", target_dir)
return
- temp_tar = pathlib.Path("/tmp") / f"{LIBCXX_BASE_NAME}.tar.xz"
- temp_extract = pathlib.Path("/tmp") / LIBCXX_BASE_NAME
+ libcxx_stage = _get_staging_dir(f"libcxx-{LLVM_VERSION}")
+ temp_tar = libcxx_stage / f"{LIBCXX_BASE_NAME}.tar.xz"
+ temp_extract = libcxx_stage / LIBCXX_BASE_NAME
if not temp_tar.exists():
logger.info("[libcxx] Downloading %s", LLVM_URL)
- urllib.request.urlretrieve(LLVM_URL, temp_tar)
+ _atomic_download(LLVM_URL, temp_tar)
+
+ # Sanity check before extracting
+ if not temp_tar.exists() or temp_tar.stat().st_size == 0:
+ raise FileNotFoundError(f"[libcxx] Tarball missing or empty: {temp_tar}")
logger.info("[libcxx] Extracting %s", temp_tar)
with tarfile.open(temp_tar, "r:xz") as tar:
@@ -437,8 +585,10 @@ def install_qnn_sdk() -> bool:
Returns:
True if both steps succeeded (or were already satisfied), else False.
"""
- if check_glibc_exist_and_validate():
- if _ensure_libcxx_stack():
- if _ensure_qnn_sdk_lib():
- return True
- return False
+ logger.info("[QNN] Starting SDK installation")
+
+ # Make sure we’re running under >= 2.34
+ ensure_glibc_minimum(GLIBC_VERSION)
+
+ # libc++ and QNN SDK setup
+ return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib()
diff --git a/setup.py b/setup.py
index fe9543f3243..97a1d05096e 100644
--- a/setup.py
+++ b/setup.py
@@ -467,11 +467,10 @@ def run(self):
# Following code is for building the Qualcomm backend.
from backends.qualcomm.scripts.download_qnn_sdk import (
_download_qnn_sdk,
- check_glibc_exist_and_validate,
is_linux_x86,
)
- if is_linux_x86() and check_glibc_exist_and_validate():
+ if is_linux_x86():
os.environ["EXECUTORCH_BUILDING_WHEEL"] = "1"
with tempfile.TemporaryDirectory() as tmpdir:
From 7ed926693fbaf471ec8072ff8896090f9fe5fd44 Mon Sep 17 00:00:00 2001
From: Hardik Sharma
Date: Wed, 1 Oct 2025 13:31:09 -0700
Subject: [PATCH 086/266] Move to ProxyValue instead of FakeTensor weights.
Differential Revision: D82605179
Pull Request resolved: https://github.com/pytorch/executorch/pull/14697
---
backends/cadence/aot/replace_ops.py | 202 +++++++++------------------
backends/cadence/aot/simplify_ops.py | 4 +-
2 files changed, 68 insertions(+), 138 deletions(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 8de0af7311d..9e95460f2f5 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -43,7 +43,6 @@
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
-from torch._subclasses import FakeTensor
from torch.fx.node import Argument
# A map to represent ops that:
@@ -90,11 +89,7 @@ def replace_logical_nop_where_with_where(
# Get the third arg node and its input
logical_not_node = node.args[0]
- logical_not_input_tensor = (
- logical_not_node.args[0].to_tensor()
- if isinstance(logical_not_node.args[0], ProxyValue)
- else logical_not_node.args[0]
- )
+ logical_not_input_tensor = logical_not_node.args[0].to_tensor()
# If the logical_not input is not a boolean tensor, bail.
if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
@@ -263,7 +258,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Glean the shape of input and output tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
in_shape = in_tensor.shape
out_shape = meta["val"].shape
# Get the select dimension
@@ -295,7 +290,7 @@ def call_operator(self, op, args, kwargs, meta):
# Create a zero bias tensor, and insert it as a graph buffer before the
# current node
- mat2_tensor = mat2.to_tensor() if isinstance(mat2, ProxyValue) else mat2
+ mat2_tensor = mat2.to_tensor()
bias_size = mat2_tensor.size(1)
zero_bias = super().call_operator(
exir_ops.edge.aten.full.default,
@@ -410,7 +405,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the old dim and new dim order
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
old_dims = tuple(range(in_tensor.dim()))
new_dims = args[1]
@@ -488,11 +483,7 @@ def call_operator(self, op, args, kwargs, meta):
repeats = args[1]
# Glean the shapes of input tensor
- in_shape = list(
- in_tensor.to_tensor().shape
- if isinstance(in_tensor, ProxyValue)
- else in_tensor.shape
- )
+ in_shape = list(in_tensor.to_tensor().shape)
# If the size of repeats is more than the dimensionality of the tensor,
# the output of repeat will be a higher-dimensional tensor. We reshape
@@ -793,15 +784,9 @@ def call_operator(self, op, args, kwargs, meta):
(in_tensor, weight, bias, stride, padding, dilation, groups) = args[0:7]
# Glean the shapes of input, weight, and output
- in_shape = (
- in_tensor.to_tensor().shape
- if isinstance(in_tensor, ProxyValue)
- else in_tensor.shape
- )
+ in_shape = in_tensor.to_tensor().shape
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
out_shape = meta["val"].shape
assert None not in {in_shape, weight_shape, out_shape}
@@ -823,26 +808,16 @@ def call_operator(self, op, args, kwargs, meta):
# Reshape the weight to [out_channels, in_channels * X]
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Reshape the input from 3d to 2d tensor
in_view = super().call_operator(
@@ -865,11 +840,7 @@ def call_operator(self, op, args, kwargs, meta):
out_zero_point,
) = args[7:12]
# If the multiplier and shift tensors are provided, use them.
- if (
- len(args) >= 14
- and isinstance(args[12], ProxyValue)
- and isinstance(args[13], ProxyValue)
- ):
+ if len(args) >= 14:
out_multiplier = args[12]
out_shift = args[13]
# If not, compute them.
@@ -1073,9 +1044,7 @@ def call_operator(self, op, args, kwargs, meta):
if groups != 1:
return super().call_operator(op, args, kwargs, meta)
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
# If this is a pointwise convolution, im2col will start dominating the
# runtime. So we call convolution op for this case.
if (
@@ -1114,8 +1083,6 @@ def call_operator(self, op, args, kwargs, meta):
{"dtype": torch.int32},
meta,
)
- if isinstance(in_tensor.to_tensor(), FakeTensor)
- else get_zero_point(in_tensor.to_tensor())
)
if quantized_op
else torch.tensor(0, dtype=torch.int32)
@@ -1151,26 +1118,16 @@ def call_operator(self, op, args, kwargs, meta):
# Get the product of the >2 dims of the weight
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Create the linear node, which multiplies the 3d input with 2d weight
# tensors with bias addition. The outermost dimension of the input is
@@ -1184,11 +1141,7 @@ def call_operator(self, op, args, kwargs, meta):
out_zero_point,
) = args[7:12]
# If the multiplier and shift tensors are provided, use them.
- if (
- len(args) >= 14
- and isinstance(args[12], ProxyValue)
- and isinstance(args[13], ProxyValue)
- ):
+ if len(args) >= 14:
out_multiplier = args[12]
out_shift = args[13]
# If not, compute them.
@@ -1276,9 +1229,7 @@ def call_operator(self, op, args, kwargs, meta):
# Get the shapes
out_shape = meta["val"].shape
- weight_shape = (
- weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
- )
+ weight_shape = weight.to_tensor().shape
assert None not in {weight_shape, out_shape}
# Determine if the transposed_convolution is NCHW or NHWC. The NHWC,
@@ -1332,26 +1283,16 @@ def call_operator(self, op, args, kwargs, meta):
# Reshape the weight to [out_channels, in_channels * X]
K = math.prod(weight_shape[1:])
- # If weight is a ProxyValue, linear_weight needs to be the output of a
- # graph operation (in this case a view_copy op) to be an explicit ProxyValue
- # as well. If not, the view op can be done directly on the tensor.
- linear_weight = (
- super().call_operator(
- exir_ops.edge.aten.view_copy.default,
- (
- weight,
- [weight_shape[0], K],
- ),
- kwargs,
- meta,
- )
- if isinstance(weight, ProxyValue)
- else weight.contiguous().view(weight_shape[0], K)
+ # Weight is always a ProxyValue, so we need a view_copy operation
+ linear_weight = super().call_operator(
+ exir_ops.edge.aten.view_copy.default,
+ (
+ weight,
+ [weight_shape[0], K],
+ ),
+ kwargs,
+ meta,
)
- # From the previous check, if linear_weight is a FakeTensor, it has to be
- # a constant (if not, it would be a ProxyValue). Mark it as such.
- if isinstance(linear_weight, FakeTensor):
- linear_weight.constant = linear_weight
# Create the linear node, which multiplies the 3d input with 2d weight
# tensors with bias addition. The outermost dimension of the input is
@@ -1422,7 +1363,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the input tensor and shape
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
in_shape = in_tensor.shape
# Get the output tensor shape
out_shape = meta["val"].shape
@@ -1491,7 +1432,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Extract the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
leading_dims = math.prod(in_tensor.shape[:-1])
# If the tensor is not a vector, do nothing.
if leading_dims != 1:
@@ -1557,11 +1498,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(
exir_ops.edge.aten.full.default,
(
- (
- args[0].to_tensor().shape
- if isinstance(args[0], ProxyValue)
- else args[0].shape
- ),
+ args[0].to_tensor().shape,
args[1],
),
{},
@@ -1602,59 +1539,57 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
replaced_scalar_args: dict[
EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]]
] = {
- exir_ops.edge.cadence.quantized_add: (
+ exir_ops.edge.cadence.quantized_add.default: (
exir_ops.edge.cadence.quantized_add.per_tensor,
[1, 2, 4, 5],
),
- exir_ops.edge.cadence.quantized_conv2d_nchw: (
+ exir_ops.edge.cadence.quantized_conv2d_nchw.default: (
exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor,
[8, 9, 12, 13],
),
- exir_ops.edge.cadence.quantized_conv2d_nhwc: (
+ exir_ops.edge.cadence.quantized_conv2d_nhwc.default: (
exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor,
[8, 9, 12, 13],
),
- exir_ops.edge.cadence.quantized_fully_connected: (
+ exir_ops.edge.cadence.quantized_fully_connected.default: (
exir_ops.edge.cadence.quantized_fully_connected.per_tensor,
[4, 5, 6],
),
- exir_ops.edge.cadence.quantized_layer_norm: (
+ exir_ops.edge.cadence.quantized_layer_norm.default: (
exir_ops.edge.cadence.quantized_layer_norm.per_tensor,
[1, 2],
),
- exir_ops.edge.cadence.quantized_linear: (
+ exir_ops.edge.cadence.quantized_linear.default: (
exir_ops.edge.cadence.quantized_linear.per_tensor,
[4, 5, 6],
),
- exir_ops.edge.cadence.quantized_relu: (
+ exir_ops.edge.cadence.quantized_relu.default: (
exir_ops.edge.cadence.quantized_relu.per_tensor,
[1, 3, 4],
),
- exir_ops.edge.cadence.im2row: (
+ exir_ops.edge.cadence.im2row.default: (
exir_ops.edge.cadence.im2row.per_tensor,
[5],
),
- exir_ops.edge.cadence.requantize: (
+ exir_ops.edge.cadence.requantize.default: (
exir_ops.edge.cadence.requantize.per_tensor,
[1, 2, 3, 4],
),
}
def call_operator(self, op, args, kwargs, meta):
- op_edge_overload_packet = get_edge_overload_packet(op)
-
- if op_edge_overload_packet not in self.replaced_scalar_args:
+ if op not in self.replaced_scalar_args:
return super().call_operator(op, args, kwargs, meta)
# Get all the args that need to be replaced.
- new_op, args_to_be_replaced = self.replaced_scalar_args[op_edge_overload_packet]
+ new_op, args_to_be_replaced = self.replaced_scalar_args[op]
+
+ if op == new_op:
+ return super().call_operator(op, args, kwargs, meta)
updated_args = list(args)
for op_arg_index in args_to_be_replaced:
arg = args[op_arg_index]
- if not isinstance(arg, ProxyValue):
- return super().call_operator(op, args, kwargs, meta)
-
if not arg.is_tensor():
return super().call_operator(op, args, kwargs, meta)
@@ -1696,7 +1631,7 @@ def call_operator(self, op, args, kwargs, meta):
# Determine if the op is avg_pool1d or avg_pool2d
avg_pool1d: bool = op == exir_ops.edge.aten.avg_pool1d.default
# Get the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
# Replace avg_pool2d with custom avg_pool2d, and if the input tensor is
# quantized, pass its zero_point tensor as arg to the custom avg_pool2d.
@@ -2062,7 +1997,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the second tensor
- Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg
+ Y_tensor = Y_arg.to_tensor()
# Concretize the bias
zero_bias = super().call_operator(
exir_ops.edge.aten.full.default,
@@ -2071,19 +2006,14 @@ def call_operator(self, op, args, kwargs, meta):
meta,
)
- # If the arg was a ProxyValue, insert a transpose node. Otherwise we
- # can simply transpose the tensor inplace.
- if isinstance(Y_arg, ProxyValue):
- transpose_args = (Y_arg, -1, -2)
- transpose_node = super().call_operator(
- exir_ops.edge.aten.transpose_copy.int,
- transpose_args,
- {},
- meta,
- )
- Y_arg_t = transpose_node
- else:
- Y_arg_t = Y_tensor.transpose(-1, -2)
+ # Y_arg is always a ProxyValue, so we insert a transpose node
+ transpose_args = (Y_arg, -1, -2)
+ Y_arg_t = super().call_operator(
+ exir_ops.edge.aten.transpose_copy.int,
+ transpose_args,
+ {},
+ meta,
+ )
# Construct the new args, and return the transposed matmult op
new_args = (
@@ -2178,7 +2108,7 @@ def call_operator(self, op, args, kwargs, meta):
return super().call_operator(op, args, kwargs, meta)
# Get the input tensor
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
# Permute NCHW to NHWC for computation
in_tensor_permuted = in_tensor.permute(0, 2, 3, 1)
in_tensor_shape = in_tensor_permuted.shape
diff --git a/backends/cadence/aot/simplify_ops.py b/backends/cadence/aot/simplify_ops.py
index bf836f09044..92c14cb0f5d 100644
--- a/backends/cadence/aot/simplify_ops.py
+++ b/backends/cadence/aot/simplify_ops.py
@@ -19,7 +19,7 @@
from executorch.backends.cadence.aot.utils import rebind
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
-from executorch.exir.pass_base import ExportPass, ProxyValue
+from executorch.exir.pass_base import ExportPass
@register_cadence_pass(CadencePassAttribute(opt_level=0))
@@ -75,7 +75,7 @@ def call_operator(self, op, args, kwargs, meta):
slice_scatter = op == exir_ops.edge.aten.slice_scatter.default
# Parse the arguments
# Extract the tensor to be sliced, and the slicing dimension
- in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+ in_tensor = args[0].to_tensor()
dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0
# Make dim non-negative
dim = dim if dim >= 0 else dim + in_tensor.dim()
From a4ac70d965298a192eaff26464017363876aa400 Mon Sep 17 00:00:00 2001
From: Abhinayk
Date: Wed, 1 Oct 2025 14:38:34 -0700
Subject: [PATCH 087/266] Disable nxp tests (#14730)
---
backends/nxp/tests/TARGETS | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/backends/nxp/tests/TARGETS b/backends/nxp/tests/TARGETS
index f492111aff2..c8ccd5fe900 100644
--- a/backends/nxp/tests/TARGETS
+++ b/backends/nxp/tests/TARGETS
@@ -1,3 +1,4 @@
+load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
@@ -50,5 +51,9 @@ python_pytest(
"//executorch/backends/nxp:neutron_backend",
":executorch_pipeline",
":models",
- ]
+ ],
+ labels = [
+ "local_only",
+ ci.skip_test(),
+ ],
)
From 649f92d4e5426d93312f4aff74ef6ba02697e834 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?=
<33344797+martinlsm@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:51:57 +0200
Subject: [PATCH 088/266] Arm backend: Correct type annotations in
aot_arm_compiler (#14627)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Correct/add type annotation in aot_arm_compiler.py
- Remove one redundant variable assignment (dead code)
Signed-off-by: Martin Lindström
---
examples/arm/aot_arm_compiler.py | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 53020d1bea0..0f3526975ff 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -61,6 +61,8 @@
from executorch.extension.export_util.utils import save_pte_program
from tabulate import tabulate
+from torch.export import ExportedProgram
+from torch.fx import GraphModule
from torch.utils.data import DataLoader
# Quantize model if required using the standard export quantizaion flow.
@@ -145,13 +147,13 @@ def get_model_and_inputs_from_name(
def quantize(
- model: torch.nn.Module,
+ model: GraphModule,
model_name: str,
compile_specs: EthosUCompileSpec | VgfCompileSpec | TosaCompileSpec,
example_inputs: Tuple[torch.Tensor],
evaluator_name: str | None,
evaluator_config: Dict[str, Any] | None,
-) -> torch.nn.Module:
+) -> GraphModule:
"""This is the official recommended flow for quantization in pytorch 2.0
export"""
logging.info("Quantizing Model...")
@@ -601,7 +603,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s
save_bundled_program(exec_prog, method_test_suites, output_name)
-def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
+def quantize_model(
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
+ compile_spec,
+) -> Tuple[GraphModule, ExportedProgram]:
model_int8 = quantize(
model,
args.model_name,
@@ -619,7 +626,10 @@ def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
def to_edge_TOSA_delegate(
- exported_program, args, model: torch.nn.Module, example_inputs
+ exported_program: ExportedProgram,
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
):
# As we can target multiple output encodings, one must
# be specified.
@@ -638,7 +648,6 @@ def to_edge_TOSA_delegate(
model_int8, exported_program = quantize_model(
args, model, example_inputs, compile_spec
)
- model = model_int8
if isinstance(compile_spec, EthosUCompileSpec):
partitioner = EthosUPartitioner(compile_spec)
@@ -660,7 +669,12 @@ def to_edge_TOSA_delegate(
return model_int8, edge
-def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_inputs):
+def to_edge_no_delegate(
+ exported_program: ExportedProgram,
+ args,
+ model: GraphModule,
+ example_inputs: Tuple[torch.Tensor],
+):
model_int8 = None
if args.quantize:
# As we can target multiple output encodings, one must
From 871fe39f4e2a2eb9833ac9d490543d9d7b73244a Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:54:12 +0200
Subject: [PATCH 089/266] Arm backend: Update full quantization annotation
(#14585)
full, full.default and fill_.Scalar were previously part of
_one_to_one_shared_input_or_input_act_qspec without having any input
nodes. This meant that these nodes were never annotated and solely
relied on the next node to annotate its input. This patch changes so
that full, full.default and fill_.Scalar are annotated in the same way
as scalar_tensor.default.
Also adds these targets to _is_large_scalar().
Signed-off-by: Oscar Andersson
---
.../arm/quantizer/quantization_annotator.py | 24 ++++++++++++-------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index d7c85447dd5..ebc91c22bbb 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -6,7 +6,7 @@
import logging
import operator
from dataclasses import dataclass
-from typing import Callable, List, Optional, Sequence
+from typing import Callable, cast, List, Optional, Sequence
import torch
import torch.fx
@@ -137,11 +137,18 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
node since histc op (in HistogramObserver) only works for values up to certain upper
bound.
"""
+ HISTC_UPPER_BOUND = 3.4028235e15
if node.op == "get_attr" and isinstance(node.target, str):
tensor = _get_node_target(gm, node.target)
# torch.histc works until this upper bound
- HISTC_UPPER_BOUND = 3.4028235e15
return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND
+ if node.op == "call_function" and node.target in (
+ torch.ops.aten.full.default,
+ torch.ops.aten.full,
+ torch.ops.aten.fill_.Scalar,
+ ):
+ fill_value = cast(float, node.args[1])
+ return abs(fill_value) > HISTC_UPPER_BOUND
return False
@@ -358,9 +365,6 @@ def _match_pattern(
torch.ops.aten.permute_copy.default,
torch.ops.aten.avg_pool2d.default,
torch.ops.aten.max_pool2d.default,
- torch.ops.aten.full.default,
- torch.ops.aten.full,
- torch.ops.aten.fill_.Scalar,
torch.ops.aten.flatten.using_ints,
torch.ops.aten.dropout.default,
torch.ops.aten.dropout_.default,
@@ -518,9 +522,6 @@ def any_or_hardtanh_min_zero(n: Node):
]
quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type]
elif node.target in _one_to_one_shared_input_or_input_act_qspec:
- if not isinstance(node.args[0], Node):
- return None
-
input_qspec = (
SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type]
if is_output_annotated(node.args[0]) # type: ignore
@@ -578,7 +579,12 @@ def any_or_hardtanh_min_zero(n: Node):
),
]
quant_properties.quant_output = None
- elif node.target in [torch.ops.aten.scalar_tensor.default]:
+ elif node.target in [
+ torch.ops.aten.scalar_tensor.default,
+ torch.ops.aten.full.default,
+ torch.ops.aten.full,
+ torch.ops.aten.fill_.Scalar,
+ ]:
quant_properties.quant_inputs = []
quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
elif node.target in [operator.getitem]:
From 0081bef92ef8bd0f58d0be3580f85dfcded2a3aa Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Wed, 1 Oct 2025 23:56:36 +0200
Subject: [PATCH 090/266] Arm backend: Add compile spec factories (#14376)
Signed-off-by: Erik Lundell
Co-authored-by: Digant Desai
---
backends/arm/TARGETS | 14 ++++
backends/arm/test/TARGETS | 6 ++
backends/arm/test/common.py | 1 +
backends/arm/test/tester/arm_tester.py | 96 ++++++++------------------
backends/arm/tosa/backend.py | 8 +--
backends/arm/util/_factory.py | 59 ++++++++++++++++
examples/arm/aot_arm_compiler.py | 33 ++-------
7 files changed, 121 insertions(+), 96 deletions(-)
create mode 100644 backends/arm/util/_factory.py
diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS
index a78ab252739..a737c4bc9de 100644
--- a/backends/arm/TARGETS
+++ b/backends/arm/TARGETS
@@ -106,3 +106,17 @@ runtime.python_library(
"//caffe2:torch",
]
)
+runtime.python_library(
+ name = "_factory",
+ srcs = [
+ "util/_factory.py"
+ ],
+ deps = [
+ ":ethosu",
+ ":vgf",
+ ":arm_compile_spec",
+ "//executorch/backends/arm/quantizer:lib",
+ "//executorch/exir/backend:operator_support",
+ "//executorch/exir/backend:compile_spec_schema",
+ ]
+)
diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS
index ec35b63f8f6..fd7d894fbf0 100644
--- a/backends/arm/test/TARGETS
+++ b/backends/arm/test/TARGETS
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load(":targets.bzl", "define_arm_tests")
@@ -58,6 +63,7 @@ runtime.python_library(
"//executorch/backends/arm/quantizer:lib",
"//executorch/backends/arm/tosa:mapping",
"//executorch/backends/arm:vgf",
+ "//executorch/backends/arm:_factory",
"//executorch/devtools/backend_debug:delegation_info",
"//executorch/exir/backend:operator_support",
"fbsource//third-party/pypi/tabulate:tabulate",
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index 963084d6091..f8a6242fc0c 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -14,6 +14,7 @@
import pytest
from executorch.backends.arm.ethosu import EthosUCompileSpec
+
from executorch.backends.arm.test.runner_utils import (
arm_executor_runner_exists,
corstone300_installed,
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 8bf72827549..9f530f428ce 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -28,17 +28,11 @@
import torch.fx
import torch.utils._pytree as pytree
-
from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
- EthosUQuantizer,
- get_symmetric_quantization_config,
- TOSAQuantizer,
- VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.test.runner_utils import (
dbg_tosa_fb_to_json,
get_output_quantization_params,
@@ -53,9 +47,13 @@
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
from executorch.backends.arm.tosa.mapping import extract_tensor_meta
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.util._factory import (
+ create_partitioner,
+ create_quantizer,
+ parse_compile_spec,
+)
+from executorch.backends.arm.vgf import VgfCompileSpec
from executorch.backends.test.harness.error_statistics import ErrorStatistics
from executorch.backends.test.harness.stages import Stage, StageType
@@ -83,7 +81,6 @@
_copy_module,
_update_exported_program_graph_module,
)
-
from tabulate import tabulate
from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec
@@ -103,12 +100,6 @@ def _dump_lowered_modules_artifact(
artifact.exported_program().graph_signature
)
- def get_output_format(lowered_module) -> str | None:
- for spec in lowered_module.compile_specs:
- if spec.key == "output_format":
- return spec.value.decode()
- return None
-
for node in graph_module.graph.nodes:
if node.op == "get_attr" and node.name.startswith("lowered_module_"):
lowered_module = getattr(graph_module, node.name)
@@ -116,13 +107,13 @@ def get_output_format(lowered_module) -> str | None:
lowered_module, LoweredBackendModule
), f"Attribute {node.name} must be of type LoweredBackendModule."
- output_format = get_output_format(lowered_module)
- if output_format == "tosa":
+ compile_spec = parse_compile_spec(lowered_module.compile_specs)
+ if isinstance(compile_spec, TosaCompileSpec):
tosa_fb = lowered_module.processed_bytes
to_print = dbg_tosa_fb_to_json(tosa_fb)
to_print = pformat(to_print, compact=True, indent=1)
output += f"\nTOSA deserialized {node.name}: \n{to_print}\n"
- elif output_format == EthosUCompileSpec.get_output_format():
+ elif isinstance(compile_spec, EthosUCompileSpec):
vela_cmd_stream = lowered_module.processed_bytes
output += f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n"
else:
@@ -284,13 +275,7 @@ def quantize(
quantize_stage: Optional[tester.Quantize] = None,
):
if quantize_stage is None:
- quantizer = None
- if isinstance(self.compile_spec, TosaCompileSpec):
- quantizer = TOSAQuantizer(self.compile_spec)
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- quantizer = EthosUQuantizer(self.compile_spec)
- elif isinstance(self.compile_spec, VgfCompileSpec):
- quantizer = VgfQuantizer(self.compile_spec)
+ quantizer = create_quantizer(self.compile_spec)
quantize_stage = tester.Quantize(
quantizer,
get_symmetric_quantization_config(),
@@ -312,14 +297,7 @@ def to_edge(
def partition(self, partition_stage: Optional[Partition] = None):
if partition_stage is None:
- if isinstance(self.compile_spec, TosaCompileSpec):
- arm_partitioner = TOSAPartitioner(self.compile_spec)
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- arm_partitioner = EthosUPartitioner(self.compile_spec)
- elif isinstance(self.compile_spec, VgfCompileSpec):
- arm_partitioner = VgfPartitioner(self.compile_spec)
- else:
- raise ValueError("compile spec doesn't target any Arm Partitioner")
+ arm_partitioner = create_partitioner(self.compile_spec)
partition_stage = Partition(arm_partitioner)
return super().partition(partition_stage)
@@ -329,7 +307,7 @@ def to_edge_transform_and_lower(
partitioners: Optional[List[Partitioner]] = None,
edge_compile_config: Optional[EdgeCompileConfig] = None,
additional_checks: Optional[
- List[Union[DontPartition | DontPartitionModule | DontPartitionName]]
+ List[DontPartition | DontPartitionModule | DontPartitionName]
] = None,
transform_passes: Optional[
Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
@@ -343,20 +321,9 @@ def to_edge_transform_and_lower(
if to_edge_and_lower_stage is None:
if partitioners is None:
- if isinstance(self.compile_spec, TosaCompileSpec):
- arm_partitioner = TOSAPartitioner(
- self.compile_spec, additional_checks
- )
- elif isinstance(self.compile_spec, EthosUCompileSpec):
- arm_partitioner = EthosUPartitioner(
- self.compile_spec, additional_checks
- )
- elif isinstance(self.compile_spec, VgfCompileSpec):
- arm_partitioner = VgfPartitioner(
- self.compile_spec, additional_checks
- )
- else:
- raise ValueError("compile spec doesn't target any Arm Partitioner")
+ arm_partitioner = create_partitioner(
+ self.compile_spec, additional_checks
+ )
partitioners = [arm_partitioner]
to_edge_and_lower_stage = ToEdgeTransformAndLower(
partitioners,
@@ -743,22 +710,19 @@ def _get_tosa_operator_distribution(
op_list = []
id = 0
while lowered_module := getattr(graph_module, f"lowered_module_{id}", None):
- for spec in lowered_module.compile_specs:
- if spec.key != "output_format":
- continue
- if spec.value == b"tosa":
- tosa_fb = lowered_module.processed_bytes
- tosa_json = dbg_tosa_fb_to_json(tosa_fb)
- for region in tosa_json["regions"]:
- for block in region["blocks"]:
- op_list.extend(
- [operator["op"] for operator in block["operators"]]
- )
- break
- elif spec.value == EthosUCompileSpec.get_output_format().encode():
- return "Can not get operator distribution for Vela command stream."
- else:
- return f"Unknown output format '{spec.value}'."
+ compile_spec = parse_compile_spec(lowered_module.compile_specs)
+ if isinstance(compile_spec, TosaCompileSpec):
+ tosa_fb = lowered_module.processed_bytes
+ tosa_json = dbg_tosa_fb_to_json(tosa_fb)
+ for region in tosa_json["regions"]:
+ for block in region["blocks"]:
+ op_list.extend([operator["op"] for operator in block["operators"]])
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return "Can not get operator distribution for Vela command stream."
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return "Can not get operator distribution for VGF."
+ else:
+ return f"Unknown output format '{compile_spec.get_output_format()}'."
id += 1
if id == 0:
return "No delegate with name 'lowered_module_0 found in graph module."
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py
index 7596573be84..7a7ea2ca377 100644
--- a/backends/arm/tosa/backend.py
+++ b/backends/arm/tosa/backend.py
@@ -206,8 +206,8 @@ def filter_tosa_compile_specs(
hardware.
"""
- new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec)
- new_compile_spec._set_compile_specs(
- compile_spec.tosa_spec, [], compile_spec.get_intermediate_path()
+ return (
+ TosaCompileSpec(compile_spec.tosa_spec)
+ .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path())
+ .dump_debug_info(compile_spec.tosa_debug_mode)
)
- return new_compile_spec
diff --git a/backends/arm/util/_factory.py b/backends/arm/util/_factory.py
new file mode 100644
index 00000000000..23d8215fc9b
--- /dev/null
+++ b/backends/arm/util/_factory.py
@@ -0,0 +1,59 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
+from executorch.backends.arm.quantizer import (
+ EthosUQuantizer,
+ TOSAQuantizer,
+ VgfQuantizer,
+)
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+def parse_compile_spec(compile_specs: list[CompileSpec]) -> ArmCompileSpec:
+ output_format = None
+ for spec in compile_specs:
+ if spec.key == "output_format":
+ output_format = spec.value.decode()
+ break
+ else:
+ raise ValueError("Compile spec without output format.")
+ if output_format == TosaCompileSpec.get_output_format():
+ return TosaCompileSpec.from_list(compile_specs)
+ if output_format == EthosUCompileSpec.get_output_format():
+ return EthosUCompileSpec.from_list(compile_specs)
+ if output_format == VgfCompileSpec.get_output_format():
+ return VgfCompileSpec.from_list(compile_specs)
+ raise ValueError(f"Unknown output format {output_format}")
+
+
+def create_partitioner(
+ compile_spec: ArmCompileSpec,
+ additional_checks: list[OperatorSupportBase] | None = None,
+):
+ if isinstance(compile_spec, TosaCompileSpec):
+ return TOSAPartitioner(compile_spec, additional_checks)
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return EthosUPartitioner(compile_spec, additional_checks)
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return VgfPartitioner(compile_spec, additional_checks)
+ else:
+ raise ValueError("compile spec doesn't target any Arm Partitioner")
+
+
+def create_quantizer(compile_spec: ArmCompileSpec):
+ if isinstance(compile_spec, TosaCompileSpec):
+ return TOSAQuantizer(compile_spec)
+ elif isinstance(compile_spec, EthosUCompileSpec):
+ return EthosUQuantizer(compile_spec)
+ elif isinstance(compile_spec, VgfCompileSpec):
+ return VgfQuantizer(compile_spec)
+ else:
+ raise ValueError("compile spec doesn't target any Arm Quantizer")
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 0f3526975ff..f3de38c20da 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -18,23 +18,18 @@
import torch
from examples.devtools.scripts.export_bundled_program import save_bundled_program
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
- EthosUQuantizer,
- get_symmetric_quantization_config,
- TOSAQuantizer,
- VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
from executorch.backends.arm.util.arm_model_evaluator import (
evaluate_model,
evaluator_calibration_data,
)
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec
# To use Cortex-M backend
from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import (
@@ -158,15 +153,8 @@ def quantize(
export"""
logging.info("Quantizing Model...")
logging.debug(f"Original model: {model}")
- quantizer = None
- if isinstance(compile_specs, EthosUCompileSpec):
- quantizer = EthosUQuantizer(compile_specs)
- elif isinstance(compile_specs, TosaCompileSpec):
- quantizer = TOSAQuantizer(compile_specs)
- elif isinstance(compile_specs, VgfCompileSpec):
- quantizer = VgfQuantizer(compile_specs)
- else:
- raise RuntimeError("Unsupported compilespecs for quantization!")
+
+ quantizer = create_quantizer(compile_specs)
operator_config = get_symmetric_quantization_config()
quantizer.set_global(operator_config)
@@ -649,14 +637,7 @@ def to_edge_TOSA_delegate(
args, model, example_inputs, compile_spec
)
- if isinstance(compile_spec, EthosUCompileSpec):
- partitioner = EthosUPartitioner(compile_spec)
- elif isinstance(compile_spec, TosaCompileSpec):
- partitioner = TOSAPartitioner(compile_spec)
- elif isinstance(compile_spec, VgfCompileSpec):
- partitioner = VgfPartitioner(compile_spec)
- else:
- raise RuntimeError(f"Unhandled compile spec: {compile_spec}")
+ partitioner = create_partitioner(compile_spec)
edge = to_edge_transform_and_lower(
exported_program,
From 0cd8256d145ef7a7913d953347228f0dac4b1ee9 Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com>
Date: Wed, 1 Oct 2025 23:57:16 +0200
Subject: [PATCH 091/266] Arm backend: Add docstrings for
operator_support/convolution_support.py (#14684)
Signed-off-by: Sebastian Larsson
---
.../operator_support/convolution_support.py | 47 +++++++++++++++----
1 file changed, 38 insertions(+), 9 deletions(-)
diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py
index 6e9d3b3528e..f335c5046f5 100644
--- a/backends/arm/operator_support/convolution_support.py
+++ b/backends/arm/operator_support/convolution_support.py
@@ -2,6 +2,12 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+"""Declare operator support for ``aten.convolution`` in TOSA.
+
+Provide general checks and hardware-specific constraints (e.g., U55 subset) for
+convolution nodes prior to delegation to the TOSA backend.
+
+"""
from typing import cast
@@ -18,6 +24,8 @@
@register_tosa_support_check
class ConvolutionSupported(SupportedTOSAOperatorCheck):
+ """Provide TOSA support check for convolutions."""
+
targets = [exir_ops.edge.aten.convolution.default]
tosa_specs = [
@@ -25,8 +33,15 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck):
TosaSpecification.create_from_string("TOSA-1.0+FP"),
]
- def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+ def is_node_tosa_supported(
+ self, node: fx.Node, tosa_spec: TosaSpecification
+ ) -> bool:
+ """Return True if the node is supported by TOSA.
+ Reject transposed convolutions and convolutions with non-zero output
+ padding. Apply additional hardware-specific constraints for U55.
+
+ """
# Not implemented
transposed = cast(bool, node.args[6])
output_padding = cast(list[int], node.args[7])
@@ -46,9 +61,19 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
else:
return True
- def _is_node_supported_u55(self, node: fx.Node):
- """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)"""
+ def _is_node_supported_u55(self, node: fx.Node) -> bool:
+ """Enforce Ethos-U55-specific constraints (Vela 4.2.0).
+
+ Check channel dimensions, kernel sizes, and stride/pad/dilation
+ combinations permitted on U55.
+ Args:
+ node (fx.Node): Convolution node to validate.
+
+ Returns:
+ bool: True if supported; otherwise, False.
+
+ """
shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape
shape_out = node.meta["val"].shape
kernel = cast(fx.Node, node.args[1]).meta["val"].shape
@@ -98,13 +123,17 @@ def _is_node_supported_u55(self, node: fx.Node):
return True
def _stride_condition(self, node: fx.Node) -> bool:
- """This condition is somewhat complex but boils down
- to not supporting stride > 3, unless we have some special conditions.
- This condition is a simplified, relaxed version of the hardware constraint,
- since the actual constraint requires information not available
- here (without a lot of work).
+ """Check a simplified stride/padding/dilation constraint.
+
+ Disallow strides greater than 3 unless there is no padding and the
+ dilation is 1. For 3D convolutions, enforce ``stride_z <= 1``.
+
+ Args:
+ node (fx.Node): Convolution node to evaluate.
+
+ Returns:
+ bool: True if the condition is satisfied.
- This means that we might accept ops that are not actually supported.
"""
strides = cast(list[int], node.args[3])
has_padding = any(pad > 0 for pad in cast(list[int], node.args[4]))
From 96dfa9c516ee76c8dbda8eeb7104f5f8c8c19a5f Mon Sep 17 00:00:00 2001
From: lucylq
Date: Wed, 1 Oct 2025 15:30:27 -0700
Subject: [PATCH 092/266] Add pybindings for bpte and ptd file
Differential Revision: D83518944
Pull Request resolved: https://github.com/pytorch/executorch/pull/14678
---
extension/pybindings/pybindings.cpp | 127 +++++++++++++------
extension/pybindings/test/test_pybindings.py | 19 ++-
2 files changed, 103 insertions(+), 43 deletions(-)
diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
index a896a4bde36..c3cd4ed0b47 100644
--- a/extension/pybindings/pybindings.cpp
+++ b/extension/pybindings/pybindings.cpp
@@ -158,6 +158,24 @@ void setup_output_storage(
}
}
+inline std::unique_ptr loader_from_buffer(
+ const void* ptr,
+ size_t ptr_len) {
+ return std::make_unique(ptr, ptr_len);
+}
+
+inline std::unique_ptr loader_from_file(const std::string& path) {
+ Result res = MmapDataLoader::from(
+ path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
+ THROW_IF_ERROR(
+ res.error(),
+ "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
+ path.c_str(),
+ static_cast(res.error()));
+
+ return std::make_unique(std::move(res.get()));
+}
+
inline std::unique_ptr load_module_from_buffer(
const void* ptr,
size_t ptr_len,
@@ -166,11 +184,11 @@ inline std::unique_ptr load_module_from_buffer(
std::unique_ptr event_tracer,
Program::Verification program_verification) {
EXECUTORCH_SCOPE_PROF("load_module_from_buffer");
- auto loader = std::make_unique(ptr, ptr_len);
+ auto loader = loader_from_buffer(ptr, ptr_len);
if (data_map_ptr.has_value() && data_map_len.has_value()) {
- auto data_map_loader = std::make_unique(
- data_map_ptr.value(), data_map_len.value());
+ auto data_map_loader =
+ loader_from_buffer(data_map_ptr.value(), data_map_len.value());
return std::make_unique(
std::move(loader),
nullptr, // memory_allocator
@@ -194,27 +212,9 @@ inline std::unique_ptr load_module_from_file(
Program::Verification program_verification) {
EXECUTORCH_SCOPE_PROF("load_module_from_file");
- Result program_loader_res = MmapDataLoader::from(
- program_path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- program_loader_res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- program_path.c_str(),
- static_cast(program_loader_res.error()));
- auto program_loader =
- std::make_unique(std::move(program_loader_res.get()));
-
+ auto program_loader = loader_from_file(program_path);
if (data_map_path.has_value()) {
- Result data_map_loader_res = MmapDataLoader::from(
- data_map_path->c_str(),
- MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- data_map_loader_res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- data_map_path->c_str(),
- static_cast(data_map_loader_res.error()));
- auto data_map_loader =
- std::make_unique(std::move(data_map_loader_res.get()));
+ auto data_map_loader = loader_from_file(data_map_path.value());
return std::make_unique(
std::move(program_loader),
nullptr, // memory_allocator
@@ -230,6 +230,22 @@ inline std::unique_ptr load_module_from_file(
nullptr); // data_map_loader
}
+inline std::unique_ptr load_module_from_buffer_with_data_file(
+ const void* ptr,
+ size_t ptr_len,
+ const std::string& data_map_path,
+ std::unique_ptr event_tracer,
+ Program::Verification program_verification) {
+ auto program_loader = loader_from_buffer(ptr, ptr_len);
+ auto data_loader = loader_from_file(data_map_path);
+ return std::make_unique(
+ std::move(program_loader),
+ nullptr, // memory_allocator
+ nullptr, // temp_allocator
+ std::move(event_tracer), // event_tracer
+ std::move(data_loader));
+}
+
inline py::list get_outputs_as_py_list(
const std::vector& outputs,
bool clone_outputs = true) {
@@ -555,6 +571,22 @@ struct PyModule final {
setup_event_tracer(enable_etdump, debug_buffer_size),
program_verification)) {}
+ explicit PyModule(
+ const void* ptr,
+ size_t ptr_len,
+ const std::string& data_path,
+ bool enable_etdump,
+ size_t debug_buffer_size = 0,
+ Program::Verification program_verification =
+ Program::Verification::InternalConsistency)
+ : debug_buffer_size_(debug_buffer_size),
+ module_(load_module_from_buffer_with_data_file(
+ ptr,
+ ptr_len,
+ data_path,
+ setup_event_tracer(enable_etdump, debug_buffer_size),
+ program_verification)) {}
+
explicit PyModule(
const std::string& program_path,
std::optional& data_path,
@@ -605,6 +637,7 @@ struct PyModule final {
program_verification);
}
+ // Load with data as a buffer.
static std::unique_ptr load_from_bundled_program(
PyBundledModule& m,
std::optional data_map_buffer,
@@ -628,6 +661,21 @@ struct PyModule final {
Program::Verification::InternalConsistency);
}
+ // Load with data as a file.
+ static std::unique_ptr load_from_bundled_program(
+ PyBundledModule& m,
+ const std::string& data_path,
+ bool enable_etdump,
+ size_t debug_buffer_size = 0) {
+ return std::make_unique(
+ m.get_program_ptr(),
+ m.get_program_len(),
+ data_path,
+ enable_etdump,
+ debug_buffer_size,
+ Program::Verification::InternalConsistency);
+ }
+
py::list run_method(
const std::string& method_name,
const py::sequence& inputs,
@@ -900,24 +948,6 @@ struct PyModule final {
}
};
-inline std::unique_ptr loader_from_buffer(
- const void* ptr,
- size_t ptr_len) {
- return std::make_unique(ptr, ptr_len);
-}
-
-inline std::unique_ptr loader_from_file(const std::string& path) {
- Result res = MmapDataLoader::from(
- path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
- THROW_IF_ERROR(
- res.error(),
- "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
- path.c_str(),
- static_cast(res.error()));
-
- return std::make_unique(std::move(res.get()));
-}
-
inline std::shared_ptr load_program(
std::unique_ptr loader,
Program::Verification program_verification) {
@@ -1474,12 +1504,25 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
call_guard);
m.def(
"_load_for_executorch_from_bundled_program",
- &PyModule::load_from_bundled_program,
+ py::overload_cast<
+ PyBundledModule&,
+ std::optional,
+ bool,
+ size_t>(&PyModule::load_from_bundled_program),
py::arg("ptr"),
py::arg("data_map_buffer") = std::nullopt,
py::arg("enable_etdump") = false,
py::arg("debug_buffer_size") = 0,
call_guard);
+ m.def(
+ "_load_for_executorch_from_bundled_program",
+ py::overload_cast(
+ &PyModule::load_from_bundled_program),
+ py::arg("ptr"),
+ py::arg("data_path"),
+ py::arg("enable_etdump") = false,
+ py::arg("debug_buffer_size") = 0,
+ call_guard);
m.def(
"_load_bundled_program_from_buffer",
&PyBundledModule::load_from_buffer,
diff --git a/extension/pybindings/test/test_pybindings.py b/extension/pybindings/test/test_pybindings.py
index 02ad6b5e327..ec45428c7d7 100644
--- a/extension/pybindings/test/test_pybindings.py
+++ b/extension/pybindings/test/test_pybindings.py
@@ -701,7 +701,7 @@ def test_program_data_separation(self) -> None:
bundled_buffer = serialize_from_bundled_program_to_flatbuffer(bundled_program)
bundled_module = self.runtime._load_bundled_program_from_buffer(bundled_buffer)
- # Load module from bundled program with external data
+ # Load module from bundled program with external data buffer
executorch_module_bundled = (
self.runtime._load_for_executorch_from_bundled_program(
bundled_module, data_buffer
@@ -710,6 +710,23 @@ def test_program_data_separation(self) -> None:
executorch_output_bundled = executorch_module_bundled.forward(inputs)[0]
self.assertTrue(torch.allclose(expected, executorch_output_bundled))
+ # Load module from bundled program with external data file
+ with tempfile.TemporaryDirectory() as tmpdir:
+ ptd_file = os.path.join(tmpdir, "linear.ptd")
+ with open(ptd_file, "wb") as ptd:
+ ptd.write(data_buffer)
+ executorch_module_bundled_data_file = (
+ self.runtime._load_for_executorch_from_bundled_program(
+ bundled_module, ptd_file
+ )
+ )
+ executorch_output_bundled_data_file = (
+ executorch_module_bundled_data_file.forward(inputs)[0]
+ )
+ self.assertTrue(
+ torch.allclose(expected, executorch_output_bundled_data_file)
+ )
+
# Test 6: Bundled program without external data should fail
executorch_module_bundled_no_data = (
self.runtime._load_for_executorch_from_bundled_program(bundled_module)
From b1309e71a2d91353dae5f8579500de5b47cdd03d Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Wed, 1 Oct 2025 23:52:46 +0100
Subject: [PATCH 093/266] Aoti support multi method (#14715)
This pull request introduces several improvements to the CUDA backend.
The main changes include adding a new graph pass to replace unnecessary
`slice_copy` operations, improving how method names are tracked in
compilation artifacts, and making the preprocessing pipeline more robust
and accurate.
**Key changes:**
### Graph optimization and preprocessing
* Introduced `ReplaceSliceCopyWithSlicePass`, a new export pass that
replaces non-mutated `slice_copy` operations with more efficient `slice`
view operations in the computational graph
(`replace_slice_copy_with_slice.py`, used in `cuda_backend.py`).
[[1]](diffhunk://#diff-c4a228b182f50f778545991d472609ad705d2325994342174093ff374738851dR1-R113)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aR115-R117)
* Added context management for attention kernel selection and no-grad
mode during AOT compilation to ensure correct backend selection for
decomposition. This is needed in the short term until we have a flash
attention cuda kernel.
### Method name and compile specification handling
* Added a `COMPILE_SPEC_KEYS` enum and utility methods
(`generate_method_name_compile_spec`, `method_name_from_compile_specs`)
to consistently embed and retrieve the method name in compile specs and
as a key in the data store, improving traceability of compiled
artifacts.
[[1]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL24-R35)
[[2]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aL161-R158)
[[3]](diffhunk://#diff-5b5ea2257772b3aba04b2534f5ea1429a0c631bfd25a7ef531f526e76c471d7aR169-R195)
### Code cleanup and maintainability
* Minor refactor in `cuda_partitioner.py` to clarify delegation tag
assignment.
* Improved imports and code organization for clarity in
`cuda_backend.py`.
These changes collectively improve the reliability, performance, and
maintainability of the CUDA backend pipeline.
---
backends/cuda/cuda_backend.py | 50 +++++++-
backends/cuda/cuda_partitioner.py | 6 +-
.../cuda/replace_slice_copy_with_slice.py | 115 ++++++++++++++++++
3 files changed, 166 insertions(+), 5 deletions(-)
create mode 100644 backends/cuda/replace_slice_copy_with_slice.py
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 49314bed5e6..a39065f6a52 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -7,10 +7,14 @@
import contextlib
import os
import typing
+from enum import Enum
from typing import Any, Dict, final, List, Optional, Set
import torch
+from executorch.backends.cuda.replace_slice_copy_with_slice import (
+ ReplaceSliceCopyWithSlicePass,
+)
from executorch.exir._serialize._named_data_store import NamedDataStore
from executorch.exir._warnings import experimental
from executorch.exir.backend.backend_details import (
@@ -21,7 +25,7 @@
from executorch.exir.backend.compile_spec_schema import CompileSpec
from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
from torch.export.passes import move_to_device_pass
-
+from torch.nn.attention import SDPBackend
# exist fallback operators in et namespace;
supported_fallback_kernels: Dict[str, Any] = {}
@@ -30,6 +34,10 @@
missing_fallback_kernels: Set[str] = set()
+class COMPILE_SPEC_KEYS(Enum):
+ METHOD_NAME = "method_name"
+
+
# context manager for non-fallback guarantee
# it will raise exception when generating fallback kernels during aoti compile
@contextlib.contextmanager
@@ -108,6 +116,9 @@ def preprocess(
# Move the edge_program from CPU to CUDA for aoti compile
cuda_edge_program = move_to_device_pass(edge_program, "cuda")
+ # replace slice_copy with slice
+ ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module)
+
edge_program_module = cuda_edge_program.module()
# Grab all input placeholders from the graph
@@ -132,7 +143,10 @@ def preprocess(
"max_autotune_conv_backends": "TRITON",
}
- with collect_unsupported_fallback_kernels():
+ with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
+ [SDPBackend.MATH]
+ ), torch.no_grad():
+ # torch._logging.set_logs(post_grad_graphs=True)
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
if len(missing_fallback_kernels) > 0:
formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
@@ -146,7 +160,10 @@ def preprocess(
so_data = f.read()
named_data_store = NamedDataStore()
- named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+ method_name = CudaBackend.method_name_from_compile_specs(compile_specs)
+ named_data_store.add_named_data(
+ method_name + "_so_blob", so_data, 1, "aoti_cuda_blob"
+ )
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
# pyre-ignorep[6]: Incompatible parameter type
@@ -157,3 +174,30 @@ def preprocess(
debug_handle_map={},
data_store_output=named_data_store.get_named_data_store_output(),
)
+
+ @staticmethod
+ def generate_method_name_compile_spec(
+ method_name: str,
+ ) -> CompileSpec:
+        """
+        Returns the compile spec that embeds the given method name, so that it
+        can later be recovered via ``method_name_from_compile_specs``.
+        """
+ return CompileSpec(
+ COMPILE_SPEC_KEYS.METHOD_NAME.value,
+ method_name.encode("utf-8"),
+ )
+
+ @staticmethod
+ def method_name_from_compile_specs(
+ compile_specs: List[CompileSpec],
+ ) -> str:
+ """
+ Returns the method name from the compile specs.
+ """
+ for spec in compile_specs:
+ if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value:
+ return spec.value.decode("utf-8")
+ raise RuntimeError(
+ f"Could not find method name in compile specs: {compile_specs}"
+ )
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index d52d7d3d087..14c75bdb937 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -44,12 +44,14 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
"""
partition_tags: Dict[str, DelegationSpec] = {}
+ tag = "tag0"
+
for node in exported_program.graph.nodes:
if node.op != "call_function":
continue
- tag = "tag0"
node.meta["delegation_tag"] = tag
- partition_tags[tag] = self.delegation_spec
+
+ partition_tags[tag] = self.delegation_spec
tag_constant_data(exported_program)
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
new file mode 100644
index 00000000000..55ddef5de9b
--- /dev/null
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -0,0 +1,115 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Iterable
+
+import torch
+from executorch.exir.dialects._ops import ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch import fx
+
+
+_SLICE_COPY_TARGETS = (
+ torch.ops.aten.slice_copy.Tensor,
+ ops.edge.aten.slice_copy.Tensor,
+)
+
+_SLICE_TARGETS = {
+ torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
+ ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
+}
+
+
+class ReplaceSliceCopyWithSlicePass(ExportPass):
+ """Replace non-mutated ``slice_copy`` results with ``slice`` views."""
+
+ def call(self, graph_module: fx.GraphModule) -> PassResult:
+ graph_changed = False
+
+ for node in graph_module.graph.nodes:
+ if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS:
+ continue
+
+ if self._has_blocking_user(node, node.users.keys()):
+ continue
+
+ node.target = _SLICE_TARGETS[node.target]
+ graph_changed = True
+
+ if graph_changed:
+ graph_module.graph.lint()
+ graph_module.recompile()
+
+ return PassResult(graph_module, graph_changed)
+
+ def _has_blocking_user(self, node: fx.Node, users: Iterable[fx.Node]) -> bool:
+ for user in users:
+ if self._is_mutating_user(node, user) or self._is_view_user(node, user):
+ return True
+ return False
+
+ def _is_mutating_user(self, node: fx.Node, user: fx.Node) -> bool:
+ if user.op == "call_method":
+ # Treat in-place tensor methods conservatively as mutations only when the
+ # method name ends with ``_`` which is the PyTorch convention for mutation.
+ return isinstance(user.target, str) and user.target.endswith("_")
+
+ if user.op != "call_function":
+ return False
+
+ target = user.target
+ if not hasattr(target, "_schema"):
+ return False
+
+ schema = target._schema # pyre-ignore[16]
+ # Positional arguments
+ for index, arg in enumerate(user.args):
+ if arg is node and self._argument_mutates(schema, index):
+ return True
+
+ # Keyword arguments
+ for name, arg in user.kwargs.items():
+ if arg is node and self._argument_mutates(schema, name):
+ return True
+
+ return False
+
+ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
+ if user.op == "call_method":
+ # Treat tensor methods conservatively and assume they may be view-producing.
+ return True
+
+ if user.op != "call_function":
+ return False
+
+ target = user.target
+ if getattr(target, "is_view", False):
+ for arg in user.args:
+ if arg is node:
+ return True
+ for arg in user.kwargs.values():
+ if arg is node:
+ return True
+
+ return False
+
+ def _argument_mutates(
+ self, schema: torch._C.FunctionSchema, key
+ ) -> bool: # pyre-ignore[11]
+ arguments = schema.arguments
+ if isinstance(key, int):
+ if key >= len(arguments):
+ return False
+ argument = arguments[key]
+ else:
+ argument = next((arg for arg in arguments if arg.name == key), None)
+ if argument is None:
+ return False
+
+ alias_info = argument.alias_info
+ return bool(alias_info and alias_info.is_write)
From 426b7015e2b8302791d37d249710cd8111c5b57b Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Thu, 2 Oct 2025 01:12:35 +0200
Subject: [PATCH 094/266] Arm backend: Backend test TOSA FP, INT and
Ethos-U55/U85 (#14653)
### Summary
Create arm_ethos_u55 and arm_ethos_u85 test flows and add them to CI
Build a semihosted runner for testing on the Corstone3x0 FVP
And split the arm_tosa test job that tested TOSA-1.0+FP into arm_tosa_fp
and arm_tosa_int to also test TOSA-1.0+INT
### Test plan
This will add new tests for arm_tosa_int arm_ethos_u55 and arm_ethos_u85
cc @digantdesai @freddan80 @per @oscarandersson8218
---------
Signed-off-by: Zingo Andersen
---
.ci/scripts/test_backend.sh | 7 +++
.github/workflows/test-backend-arm.yml | 2 +-
backends/test/suite/flow.py | 17 ++++++-
backends/test/suite/flows/arm.py | 68 ++++++++++++++++++++++----
4 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index df98fb43372..ba5df5c3fe3 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -1,6 +1,7 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -58,6 +59,12 @@ fi
if [[ "$FLOW" == *arm* ]]; then
# Setup ARM deps.
.ci/scripts/setup-arm-baremetal-tools.sh
+
+ if [[ "$FLOW" == *ethos_u* ]]; then
+ # Prepare a test runner binary that can run on the Corstone-3x0 FVPs
+ backends/arm/scripts/build_executorch.sh
+ backends/arm/test/setup_testing.sh
+ fi
fi
if [[ $IS_MACOS -eq 1 ]]; then
diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml
index bee74fee172..428e3fd1239 100644
--- a/.github/workflows/test-backend-arm.yml
+++ b/.github/workflows/test-backend-arm.yml
@@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_test_backend.yml
with:
backend: arm
- flows: '["arm_tosa"]'
+ flows: '["arm_tosa_fp", "arm_tosa_int", "arm_ethos_u55", "arm_ethos_u85"]'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 120
run-linux: true
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
index 05fc760683d..29394951bd7 100644
--- a/backends/test/suite/flow.py
+++ b/backends/test/suite/flow.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
import logging
from dataclasses import dataclass, field
@@ -122,10 +127,18 @@ def all_flows() -> dict[str, TestFlow]:
logger.info(f"Skipping QNN flow registration: {e}")
try:
- from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW
+ from executorch.backends.test.suite.flows.arm import (
+ ARM_ETHOS_U55_FLOW,
+ ARM_ETHOS_U85_FLOW,
+ ARM_TOSA_FP_FLOW,
+ ARM_TOSA_INT_FLOW,
+ )
flows += [
- ARM_TOSA_FLOW,
+ ARM_TOSA_FP_FLOW,
+ ARM_TOSA_INT_FLOW,
+ ARM_ETHOS_U55_FLOW,
+ ARM_ETHOS_U85_FLOW,
]
except Exception as e:
logger.info(f"Skipping ARM flow registration: {e}")
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
index baa2df79de9..34a6346fb1f 100644
--- a/backends/test/suite/flows/arm.py
+++ b/backends/test/suite/flows/arm.py
@@ -1,24 +1,72 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from executorch.backends.arm.quantizer import (
+ get_symmetric_quantization_config,
+ TOSAQuantizer,
+)
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.backends.test.suite.flow import TestFlow
+from executorch.backends.xnnpack.test.tester.tester import Quantize
-def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester:
- kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+def _create_tosa_flow(
+ name,
+ compile_spec,
+ quantize: bool = False,
+ symmetric_io_quantization: bool = False,
+ per_channel_quantization: bool = True,
+) -> TestFlow:
- return ArmTester(
- *args,
- **kwargs,
- )
+ def _create_arm_tester(*args, **kwargs) -> ArmTester:
+ kwargs["compile_spec"] = compile_spec
+ return ArmTester(
+ *args,
+ **kwargs,
+ )
+
+ # Create and configure quantizer to use in the flow
+ def create_quantize_stage() -> Quantize:
+ quantizer = TOSAQuantizer(compile_spec)
+ quantization_config = get_symmetric_quantization_config(
+ is_per_channel=per_channel_quantization
+ )
+ if symmetric_io_quantization:
+ quantizer.set_io(quantization_config)
+ return Quantize(quantizer, quantization_config)
-def _create_tosa_flow() -> TestFlow:
return TestFlow(
- "arm_tosa",
+ name,
backend="arm",
- tester_factory=_create_arm_tester_tosa_fp,
+ tester_factory=_create_arm_tester,
supports_serialize=False,
+ quantize=quantize,
+ quantize_stage_factory=create_quantize_stage if quantize else None,
)
-ARM_TOSA_FLOW = _create_tosa_flow()
+ARM_TOSA_FP_FLOW = _create_tosa_flow(
+ "arm_tosa_fp", common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+)
+ARM_TOSA_INT_FLOW = _create_tosa_flow(
+ "arm_tosa_int",
+ common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
+ quantize=True,
+)
+
+ARM_ETHOS_U55_FLOW = _create_tosa_flow(
+ "arm_ethos_u55",
+ common.get_u55_compile_spec(),
+ quantize=True,
+)
+
+ARM_ETHOS_U85_FLOW = _create_tosa_flow(
+ "arm_ethos_u85",
+ common.get_u85_compile_spec(),
+ quantize=True,
+)
From d4f208d2690bc9abae4709a8932d0ab596d81cc4 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com>
Date: Wed, 1 Oct 2025 17:11:37 -0700
Subject: [PATCH 095/266] Android set different maven package names of flavors
(#14674)
Different flavor name generates different maven packages
---
.github/workflows/android-release-artifacts.yml | 4 ++++
extension/android/executorch_android/build.gradle | 3 ++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml
index bec6d3a0f5e..beda0f77c83 100644
--- a/.github/workflows/android-release-artifacts.yml
+++ b/.github/workflows/android-release-artifacts.yml
@@ -90,6 +90,10 @@ jobs:
fi
FLAVOR="${{ inputs.flavor }}"
+ if [ ! -z "$FLAVOR" ]; then
+ GRADLE_ARGS+=" -Dflavor=${FLAVOR}"
+ fi
+
if [[ "$FLAVOR" == "vulkan" || -z "$FLAVOR" ]]; then
curl -O https://sdk.lunarg.com/sdk/download/1.4.321.1/linux/vulkansdk-linux-x86_64-1.4.321.1.tar.xz
tar xf vulkansdk-linux-x86_64-1.4.321.1.tar.xz -C /tmp
diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle
index e36044e3da5..0c18d60721e 100644
--- a/extension/android/executorch_android/build.gradle
+++ b/extension/android/executorch_android/build.gradle
@@ -15,6 +15,7 @@ plugins {
def qnnVersion = System.properties['qnnVersion']
def execuTorchVersion = System.properties['execuTorchVersion']
+def flavor = System.properties['flavor']
android {
namespace = "org.pytorch.executorch"
@@ -69,7 +70,7 @@ mavenPublishing {
publishToMavenCentral()
signAllPublications()
- coordinates("org.pytorch", "executorch-android" + (qnnVersion ? "-qnn" : ""), execuTorchVersion ? execuTorchVersion : "0.7.0-SNAPSHOT")
+ coordinates("org.pytorch", "executorch-android" + (flavor ? "-" + flavor : ""), execuTorchVersion ? execuTorchVersion : "1.0.0-SNAPSHOT")
pom {
name = "ExecuTorch Android"
From e608a21fc1845d960142b4c78ade06cdafdf5036 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Wed, 1 Oct 2025 19:45:58 -0600
Subject: [PATCH 096/266] [Backend Tester] Update README (#14739)
### Summary
Update the readme for the backend test suite to describe how to run with
pytest and to generally update for recent changes. Add CLI examples for
common invocation patterns (filter by test, flow, or backend) and add
some brief info on the JSON report format.
---
backends/test/suite/README.md | 80 +++++++++++++++++++++++++----------
1 file changed, 57 insertions(+), 23 deletions(-)
diff --git a/backends/test/suite/README.md b/backends/test/suite/README.md
index 564f44362ad..901cd461dbe 100644
--- a/backends/test/suite/README.md
+++ b/backends/test/suite/README.md
@@ -5,37 +5,71 @@ This directory contains tests that validate correctness and coverage of backends
These tests are intended to ensure that backends are robust and provide a smooth, "out-of-box" experience for users across the full span of input patterns. They are not intended to be a replacement for backend-specific tests, as they do not attempt to validate performance or that backends delegate operators that they expect to.
## Running Tests and Interpreting Output
-Tests can be run from the command line, either using the runner.py entry point or the standard Python unittest runner. When running through runner.py, the test runner will report test statistics, including the number of tests with each result type.
+Tests can be run from the command line using pytest. When generating a JSON test report, the runner will report detailed test statistics, including output accuracy, delegated nodes, lowering timing, and more.
-Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements.
+Each backend and test flow (recipe) registers a pytest [marker](https://docs.pytest.org/en/stable/example/markers.html) that can be passed to pytest with the `-m marker` argument to filter execution.
-Example:
+To run all XNNPACK backend operator tests:
```
-ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner
+pytest -c /dev/null backends/test/suite/operators/ -m backend_xnnpack -n auto
```
+To run all model tests for the CoreML static int8 lowering flow:
+```
+pytest -c /dev/null backends/test/suite/models/ -m flow_coreml_static_int8 -n auto
```
-2465 Passed / 2494
-16 Failed
-13 Skipped
-[Success]
-736 Delegated
-1729 Undelegated
+To run a specific test:
+```
+pytest -c /dev/null backends/test/suite/ -k "test_prelu_f32_custom_init[xnnpack]"
+```
-[Failure]
-5 Lowering Fail
-3 PTE Run Fail
-8 Output Mismatch Fail
+To generate a JSON report:
+```
+pytest -c /dev/null backends/test/suite/operators/ -n auto --json-report --json-report-file="test_report.json"
```
-Outcomes can be interpreted as follows:
- * Success (delegated): The test passed and at least one op was delegated by the backend.
- * Success (undelegated): The test passed with no ops delegated by the backend. This is a pass, as the partitioner works as intended.
- * Skipped: test fails in eager or export (indicative of a test or dynamo issue).
- * Lowering fail: The test fails in to_edge_transform_and_lower.
- * PTE run failure: The test errors out when loading or running the method.
- * Output mismatch failure: Output delta (vs eager) exceeds the configured tolerance.
+See [pytest-json-report](https://pypi.org/project/pytest-json-report/) for information on the report format. The test logic in this repository attaches additional metadata to each test entry under the `metadata`/`subtests` keys. One entry is created for each call to `test_runner.lower_and_run_model`.
+
+Here is an excerpt from a test run, showing a successful run of the `test_add_f32_bcast_first[xnnpack]` test.
+```json
+"tests": [
+ {
+ "nodeid": "operators/test_add.py::test_add_f32_bcast_first[xnnpack]",
+ "lineno": 38,
+ "outcome": "passed",
+ "keywords": [
+ "test_add_f32_bcast_first[xnnpack]",
+ "flow_xnnpack",
+ "backend_xnnpack",
+ ...
+ ],
+ "metadata": {
+ "subtests": [
+ {
+ "Test ID": "test_add_f32_bcast_first[xnnpack]",
+ "Test Case": "test_add_f32_bcast_first",
+ "Subtest": 0,
+ "Flow": "xnnpack",
+ "Result": "Pass",
+ "Result Detail": "",
+ "Error": "",
+ "Delegated": "True",
+ "Quantize Time (s)": null,
+ "Lower Time (s)": "2.881",
+ "Output 0 Error Max": "0.000",
+ "Output 0 Error MAE": "0.000",
+ "Output 0 SNR": "inf",
+ "Delegated Nodes": 1,
+ "Undelegated Nodes": 0,
+ "Delegated Ops": {
+ "aten::add.Tensor": 1
+ },
+ "PTE Size (Kb)": "1.600"
+ }
+ ]
+ }
+```
## Backend Registration
@@ -43,11 +77,11 @@ To plug into the test framework, each backend should provide an implementation o
At a minimum, the backend will likely need to provide a custom implementation of the Partition and ToEdgeTransformAndLower stages using the appropriate backend partitioner. See backends/xnnpack/test/tester/tester.py for an example implementation.
-Once a tester is available, the backend flow(s) can be added in __init__.py in this directory by adding an entry to `ALL_TESTER_FLOWS`. Each flow entry consists of a name (used in the test case naming) and a function to instantiate a tester for a given model and input tuple.
+Once a tester is available, the backend flow(s) can be added under flows/ and registered in flow.py. It is intended that this will be unified with the lowering recipes under executorch/export in the near future.
## Test Cases
-Operator test cases are defined under the operators/ directory. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow. The `@operator_test` decorator is applied to each test class to trigger this behavior. Tests can also be tagged with an appropriate type specifier, such as `@dtype_test`, to generate variants for each dtype. The decorators and "magic" live in __init__.py in this directory.
+Operator test cases are defined under the operators/ directory. Model tests are under models/. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow by use of the `test_runner` fixture parameter. Tests can additionally be parameterized using standard pytest decorators. Parameterizing over dtype is a common use case.
## Evolution of this Test Suite
From fb66fb38604dc02abfc3c52d97d5af72725c92b3 Mon Sep 17 00:00:00 2001
From: robert-kalmar
Date: Thu, 2 Oct 2025 04:12:16 +0200
Subject: [PATCH 097/266] NXP Backend: Add codeowner for the NXP Backend
(#14723)
Add codeowner for the NXP Backend.
cc @digantdesai @JakeStevens
---
CODEOWNERS | 2 ++
1 file changed, 2 insertions(+)
diff --git a/CODEOWNERS b/CODEOWNERS
index 10baed9ede4..11f3ca07615 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -14,6 +14,7 @@
/backends/transforms @kimishpatel
/backends/vulkan @SS-JIA
/backends/xnnpack @digantdesai @mcr229
+/backends/nxp @robert-kalmar
/devtools @Gasoonjia
@@ -33,6 +34,7 @@
/examples/qualcomm @cccclai
/examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka
/examples/xnnpack @digantdesai @mcr229
+/examples/nxp @robert-kalmar
/exir/backend @cccclai @kimishpatel @JacobSzwejbka
/exir @JacobSzwejbka @larryliu0820
From baaaa86ca9bb0a61c42cc36b781571bd5cac2cf6 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Wed, 1 Oct 2025 19:30:25 -0700
Subject: [PATCH 098/266] Add transposed convolution
Differential Revision: D83602808
Pull Request resolved: https://github.com/pytorch/executorch/pull/14708
---
backends/cadence/aot/ref_implementations.py | 59 ++++++++
.../aot/tests/test_ref_implementations.py | 137 ++++++++++++++++++
2 files changed, 196 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 312bed89315..ca15e825ff0 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -960,6 +960,7 @@ def convolution(
_stride: tuple[int, int] | int = stride
_padding: tuple[int, int] | int = padding
_dilation: tuple[int, int] | int = dilation
+
if conv_is_1d:
conv = torch.nn.functional.conv1d
_stride = stride[0]
@@ -978,6 +979,64 @@ def convolution(
return conv_out
+@impl(m, "transposed_convolution")
+def transposed_convolution(
+ input_tensor: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ stride: tuple[int, int],
+ padding: tuple[int, int],
+ dilation: tuple[int, int],
+ output_padding: tuple[int, int],
+ groups: int,
+ channel_last: bool = False,
+) -> torch.Tensor:
+
+ conv_is_1d = len(input_tensor.shape) == 3
+ if channel_last:
+ if conv_is_1d:
+ input_tensor = input_tensor.movedim(-1, 1).contiguous()
+ if len(weight.shape) != 3:
+ raise ValueError("Weight tensor must be 3D if input is 3D")
+ weight = weight.movedim(-1, 1).contiguous()
+ else:
+ input_tensor = input_tensor.movedim(-1, -3)
+ if len(weight.shape) != 4:
+ raise ValueError("Weight tensor must be 4D if input is nd > 3")
+ weight = torch.permute(weight, (0, -1, 1, 2)).contiguous()
+
+ _stride: tuple[int, int] | int = stride
+ _padding: tuple[int, int] | int = padding
+ _dilation: tuple[int, int] | int = dilation
+ _output_padding: tuple[int, int] | int = output_padding
+ if conv_is_1d:
+ conv = torch.nn.functional.conv_transpose1d
+ _stride = stride[0]
+ _padding = padding[0]
+ _dilation = dilation[0]
+ _output_padding = output_padding[0]
+ else:
+ conv = torch.nn.functional.conv_transpose2d
+
+ conv_out = conv(
+ input_tensor,
+ weight,
+ bias,
+ _stride,
+ _padding,
+ _output_padding,
+ groups,
+ _dilation,
+ )
+ if channel_last:
+ if conv_is_1d:
+ conv_out = conv_out.movedim(1, -1).contiguous()
+ else:
+ conv_out = conv_out.movedim(-3, -1).contiguous()
+
+ return conv_out
+
+
@impl(m, "avg_pool2d")
def avg_pool2d(
input_tensor: torch.Tensor,
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 32e9b43e68e..8d02c5c2963 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -1534,6 +1534,143 @@ def test_convolution(
f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
)
+ @expand(
+ [
+ # Basic 2D transposed convolution with stride=1 (current test case - corrected name)
+ (
+ "basic_2d_stride1",
+ torch.tensor(
+ [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+ ), # input: 1x1x2x2
+ torch.tensor(
+ [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32
+ ), # weight: 1x1x2x2
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ False, # channel_last
+ torch.tensor(
+ [[[[1.0, 3.0, 2.0], [4.0, 10.0, 6.0], [3.0, 7.0, 4.0]]]],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2D transposed convolution with channel_last=True (NHWC format)
+ (
+ "channel_last_nhwc",
+ torch.tensor(
+ [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32
+ ), # input: 1x2x2x1 (NHWC)
+ torch.tensor(
+ [[[[1.0], [1.0]], [[1.0], [1.0]]]], dtype=torch.float32
+ ), # weight: 1x2x2x1 (NHWC)
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ True, # channel_last=True
+ torch.tensor(
+ [
+ [
+ [[1.0], [3.0], [2.0]],
+ [[4.0], [10.0], [6.0]],
+ [[3.0], [7.0], [4.0]],
+ ]
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2D transposed convolution with non-zero bias
+ (
+ "with_bias",
+ torch.tensor(
+ [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+ ), # input: 1x1x2x2
+ torch.tensor(
+ [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32
+ ), # weight: 1x1x2x2
+ torch.tensor([5.0], dtype=torch.float32), # bias=5.0
+ (1, 1), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ False, # channel_last
+ torch.tensor(
+ [[[[6.0, 7.0, 5.0], [8.0, 10.0, 7.0], [5.0, 8.0, 9.0]]]],
+ dtype=torch.float32,
+ ),
+ ),
+ # 1D transposed convolution (3D tensor, NLC format)
+ (
+ "conv1d_nlc",
+ torch.tensor(
+ [[[1.0], [2.0], [3.0]]], dtype=torch.float32
+ ), # input: 1x3x1 (NLC)
+ torch.tensor(
+ [[[1.0], [0.5]]], dtype=torch.float32
+ ), # weight: 1x2x1 (NLC)
+ torch.tensor([0.0], dtype=torch.float32), # bias
+ (2, 0), # stride
+ (0, 0), # padding
+ (1, 1), # dilation
+ 1, # groups
+ (0, 0), # output_padding
+ True, # channel_last=True
+ torch.tensor(
+ [[[1.0], [0.5], [2.0], [1.0], [3.0], [1.5]]], dtype=torch.float32
+ ),
+ ),
+ ]
+ )
+ def test_transposed_convolution(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ stride: tuple[int, int],
+ padding: tuple[int, int],
+ dilation: tuple[int, int],
+ groups: int,
+ output_padding: tuple[int, int],
+ channel_last: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ output = torch.ops.cadence.transposed_convolution(
+ input_tensor,
+ weight,
+ bias,
+ stride,
+ padding,
+ dilation,
+ output_padding,
+ groups,
+ channel_last,
+ )
+
+ # Verify output properties
+ self.assertEqual(
+ output.dtype,
+ input_tensor.dtype,
+ f"Output dtype should match input dtype in {name}",
+ )
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"Output shape should match expected shape in {name}",
+ )
+
+ # Verify output matches expected values
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
+ )
+
@expand(
[
# Basic non-quantized average pooling
From 9ab5592a6533e9d903d927ff70d9aef83a74f0c6 Mon Sep 17 00:00:00 2001
From: cccclai
Date: Wed, 1 Oct 2025 21:13:24 -0700
Subject: [PATCH 099/266] support qnn mean (dim=None) (#14675)
Summary: Address mean op lower failure. When dim is not specified, it
will take mean across all axes. For QNN, we need to get axes based on
input shape
Differential Revision: D83520776
---
backends/qualcomm/builders/op_mean_dim.py | 19 ++-
backends/qualcomm/tests/models.py | 25 ++--
backends/qualcomm/tests/test_qnn_delegate.py | 132 ++++++++++++++++---
3 files changed, 143 insertions(+), 33 deletions(-)
diff --git a/backends/qualcomm/builders/op_mean_dim.py b/backends/qualcomm/builders/op_mean_dim.py
index 630b1b0b8de..22cb47ee288 100644
--- a/backends/qualcomm/builders/op_mean_dim.py
+++ b/backends/qualcomm/builders/op_mean_dim.py
@@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-from typing import cast, Dict, List
+from typing import cast, Dict
import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
@@ -40,7 +40,22 @@ def define_node(
)
# mean dims and keep dims
- mean_dims = cast(List[int], node.args[1])
+ rank = len(input_node.meta["val"].shape)
+
+ if rank == 0:
+ raise RuntimeError(
+ "Mean doesn't support 0d input, please report a bug in https://github.com/pytorch/executorch/issues"
+ )
+
+ dim_arg = node.args[1]
+
+ if dim_arg is None or len(dim_arg) == 0:
+ mean_dims = list(range(rank)) # reduce over all dims
+ elif isinstance(dim_arg, int):
+ mean_dims = [dim_arg]
+ else:
+ mean_dims = list(dim_arg)
+ print("mean_dims: ", mean_dims, "rank: ", rank)
mean_dims = [
mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims
]
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index a37648cb6be..cf4b2f21aaa 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -4,8 +4,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-import torch
+from typing import List, Optional, Tuple, Union
+import torch
# module with related operator only
@@ -1332,20 +1333,20 @@ def forward(self, x):
return self.max_pool2d(x)
-class MeanWKeppDim(torch.nn.Module):
- def __init__(self):
- super().__init__()
-
- def forward(self, x):
- return torch.mean(x, (-1, -2), keepdim=True)
-
-
-class MeanWOKeppDim(torch.nn.Module):
- def __init__(self):
+class Mean(torch.nn.Module):
+ def __init__(
+ self,
+ dim: Optional[Union[int, Tuple[int, ...], List[int]]] = None,
+ keepdim: bool = False,
+ dtype: Optional[torch.dtype] = None,
+ ):
super().__init__()
+ self.dim = dim
+ self.keepdim = keepdim
+ self.dtype = dtype
def forward(self, x):
- return torch.mean(x, (-1, -2))
+ return torch.mean(x, dim=self.dim, keepdim=self.keepdim, dtype=self.dtype)
class MaskedFill(torch.nn.Module):
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 6c444c90c08..e3cf52b9a6f 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -1018,12 +1018,61 @@ def test_qnn_backend_max_pool2d(self):
sample_input = (torch.randn(4, 3, 24, 24),)
self.lower_module_and_test_output(module, sample_input)
- def test_qnn_backend_mean_dim(self):
- modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405
- sample_input = (torch.randn([2, 5, 1, 3]),)
- for i, module in enumerate(modules):
+ def test_qnn_backend_mean(self):
+ test_comb = [
+ # Reduce over last two dims, keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Reduce over last two dims, keepdim=False
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Default: reduce all dims
+ {
+ QCOM_MODULE: Mean(), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+ },
+ # TODO: To be enabled via reshape input to 1d tensor
+ # # Scalar case
+ # {
+ # QCOM_MODULE: Mean(),
+ # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+ # },
+ # Edge case: dim is an empty list
+ {
+ QCOM_MODULE: Mean(dim=[]), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 (batch dimension)
+ {
+ QCOM_MODULE: Mean(dim=0), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 with keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along multiple dims
+ {
+ QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+ },
+ # Edge case: high-dimensional tensor
+ {
+ QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+ },
+ ]
+
+ for i, test in enumerate(test_comb):
with self.subTest(i=i):
- self.lower_module_and_test_output(module, sample_input)
+ self.lower_module_and_test_output(
+ test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+ )
@unittest.skip("failed to lower in QNN 2.26")
def test_qnn_backend_mha(self):
@@ -1216,10 +1265,8 @@ def test_qnn_backend_slice_scatter(self):
],
QCOM_SAMPLE_INPUTS: [
(
- (
- torch.zeros(8, 8),
- torch.ones(8, 2),
- )
+ torch.zeros(8, 8),
+ torch.ones(8, 2),
)
],
},
@@ -2666,13 +2713,62 @@ def test_qnn_backend_max_pool2d(self):
module = self.get_qdq_module(module, sample_input)
self.lower_module_and_test_output(module, sample_input)
- def test_qnn_backend_mean_dim(self):
- modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405
- sample_input = (torch.randn([2, 5, 1, 3]),)
- for i, module in enumerate(modules):
+ def test_qnn_backend_mean(self):
+ test_comb = [
+ # Reduce over last two dims, keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Reduce over last two dims, keepdim=False
+ {
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+ },
+ # Default: reduce all dims
+ {
+ QCOM_MODULE: Mean(), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+ },
+ # TODO: To be enabled via reshape input to 1d tensor
+ # Scalar case
+ # {
+ # QCOM_MODULE: Mean(),
+ # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+ # },
+ # Edge case: dim is an empty list
+ {
+ QCOM_MODULE: Mean(dim=[]), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 (batch dimension)
+ {
+ QCOM_MODULE: Mean(dim=0), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along dim=0 with keepdim=True
+ {
+ QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+ },
+ # Edge case: reduce along multiple dims
+ {
+ QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+ },
+ # Edge case: high-dimensional tensor
+ {
+ QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405
+ QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+ },
+ ]
+
+ for i, test in enumerate(test_comb):
with self.subTest(i=i):
- module = self.get_qdq_module(module, sample_input)
- self.lower_module_and_test_output(module, sample_input)
+ module = self.get_qdq_module(
+ test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+ )
+ self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS])
def test_qnn_backend_mha(self):
module = MultiheadAttention() # noqa: F405
@@ -2897,10 +2993,8 @@ def test_qnn_backend_slice_scatter(self):
],
QCOM_SAMPLE_INPUTS: [
(
- (
- torch.zeros(8, 8),
- torch.ones(8, 2),
- )
+ torch.zeros(8, 8),
+ torch.ones(8, 2),
)
],
},
From f24351a365ef5929538473a6d8983f7d0f1ddb50 Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Wed, 1 Oct 2025 22:28:18 -0700
Subject: [PATCH 100/266] Update mul int16 test
Differential Revision: D83437473
Pull Request resolved: https://github.com/pytorch/executorch/pull/14646
---
backends/arm/operators/op_repeat.py | 2 +-
backends/arm/test/ops/test_mul.py | 6 ------
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py
index 5db7ce9347c..9ee4e9fedf8 100644
--- a/backends/arm/operators/op_repeat.py
+++ b/backends/arm/operators/op_repeat.py
@@ -44,7 +44,7 @@ def define_node(
validate_valid_dtype(
self.target,
[inputs[0], output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT8, ts.DType.INT32, ts.DType.INT16, ts.DType.FP32],
output.tosa_spec,
)
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index e3f2096e7da..2c7b040658a 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -338,9 +338,6 @@ def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone300
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
"""Test mul operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
@@ -365,9 +362,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1):
"""Test mul operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
From 499ce5038cf4589eb6761dceb5763acc736fbec1 Mon Sep 17 00:00:00 2001
From: Yufeng Shi
Date: Thu, 2 Oct 2025 13:57:33 +0100
Subject: [PATCH 101/266] Arm backend: Add VGF tests to StableDiffusion module
tests (#14655)
Also refactor the StableDiffusion module tests to use test_pipeline
instead of ArmTester directly.
Signed-off-by: Yufeng Shi
---
.../test_CLIPTextModelWithProjection.py | 146 ++++++++++++------
.../test_SD3Transformer2DModel.py | 138 +++++++++++------
.../stable_diffusion/test_T5EncoderModel.py | 140 +++++++++++------
.../test_vae_AutoencoderKL.py | 114 +++++++++-----
4 files changed, 359 insertions(+), 179 deletions(-)
diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
index 49266beee63..fad31b57537 100644
--- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
CLIP_text_encoder_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
from transformers import CLIPTextModelWithProjection
+input_t = Tuple[torch.Tensor]
+
-class TestCLIPTextModelWithProjection(unittest.TestCase):
+class TestCLIPTextModelWithProjection:
"""
Test class of CLIPTextModelWithProjection.
CLIPTextModelWithProjection is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -69,47 +75,93 @@ def prepare_model_and_inputs(self):
return text_encoder_model, text_encoder_model_inputs
- def test_CLIPTextModelWithProjection_tosa_FP(self):
- text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- text_encoder_model,
- example_inputs=text_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- transform_passes=[
- ConvertInt64ConstOpsToInt32Pass(),
- ConvertInt64OutputOpsToInt32Pass(),
- InsertInt32CastsAfterInt64PlaceholdersPass(),
- ],
- )
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=text_encoder_model_inputs,
- )
- )
-
- def test_CLIPTextModelWithProjection_tosa_INT(self):
- text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- text_encoder_model,
- example_inputs=text_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=text_encoder_model_inputs,
- atol=0.8,
- )
- )
+
+def test_CLIPTextModelWithProjection_tosa_FP():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_CLIPTextModelWithProjection_tosa_INT():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ atol=0.8,
+ )
+ pipeline.change_args(
+ "check_count.exir",
+ TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_FP():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+            atol=4,  # TODO: Investigate numerical issue: MAX Diff ~50%
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_INT():
+ text_encoder_model, text_encoder_model_inputs = (
+ TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ text_encoder_model,
+ text_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ atol=0.8,
+ )
+ pipeline.change_args(
+ "check_count.exir",
+ TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index f9d814d044b..1267c5b8e4c 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from diffusers.models.transformers import SD3Transformer2DModel
@@ -13,10 +13,16 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
SD3Transformer2DModel_init_dict,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+input_t4 = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
-class TestSD3Transformer2DModel(unittest.TestCase):
+class TestSD3Transformer2DModel:
"""
Test class of AutoenSD3Transformer2DModelcoderKL.
SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium
@@ -93,48 +99,88 @@ def forward(self, *args, **kwargs):
return sd35_transformer2D_model, sd35_transformer2D_model_inputs
- def test_SD3Transformer2DModel_tosa_FP(self):
- sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
- self.prepare_model_and_inputs()
- )
- with torch.no_grad():
- (
- ArmTester(
- sd35_transformer2D_model,
- example_inputs=sd35_transformer2D_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- )
- .export()
- .to_edge_transform_and_lower()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=sd35_transformer2D_model_inputs,
- rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
- atol=4.0,
- )
- )
- def test_SD3Transformer2DModel_tosa_INT(self):
- sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
- self.prepare_model_and_inputs()
+def test_SD3Transformer2DModel_tosa_FP():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ atol=4.0,
)
- with torch.no_grad():
- (
- ArmTester(
- sd35_transformer2D_model,
- example_inputs=sd35_transformer2D_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=sd35_transformer2D_model_inputs,
- qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
- rtol=1.0,
- atol=4.0,
- )
- )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_SD3Transformer2DModel_tosa_INT():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ rtol=1.0,
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_FP():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_INT():
+ sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+ TestSD3Transformer2DModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t4](
+ sd35_transformer2D_model,
+ sd35_transformer2D_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+ rtol=1.0,
+ atol=4.0,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
index 22a47042eb1..20b92e4a258 100644
--- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
T5_encoder_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
from transformers import T5EncoderModel
+input_t = Tuple[torch.Tensor]
+
-class TestT5EncoderModel(unittest.TestCase):
+class TestT5EncoderModel:
"""
Test class of T5EncoderModel.
T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -61,46 +67,88 @@ def prepare_model_and_inputs(self):
return t5_encoder_model, t5_encoder_model_inputs
- def test_T5EncoderModel_tosa_FP(self):
- t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- t5_encoder_model,
- example_inputs=t5_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- transform_passes=[
- ConvertInt64ConstOpsToInt32Pass(),
- ConvertInt64OutputOpsToInt32Pass(),
- InsertInt32CastsAfterInt64PlaceholdersPass(),
- ],
- )
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_FP)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=t5_encoder_model_inputs,
- )
- )
-
- def test_T5EncoderModel_tosa_INT(self):
- t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- t5_encoder_model,
- example_inputs=t5_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .dump_operator_distribution()
- .check_count(self.ops_after_partitioner_INT)
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=t5_encoder_model_inputs,
- )
- )
+
+def test_T5EncoderModel_tosa_FP():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+def test_T5EncoderModel_tosa_INT():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_FP():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ transform_passes=[
+ ConvertInt64ConstOpsToInt32Pass(),
+ ConvertInt64OutputOpsToInt32Pass(),
+ InsertInt32CastsAfterInt64PlaceholdersPass(),
+ ],
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_INT():
+ t5_encoder_model, t5_encoder_model_inputs = (
+ TestT5EncoderModel().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ t5_encoder_model,
+ t5_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.change_args(
+ "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+ )
+ pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
index ab0f4892fb8..a3c3a018131 100644
--- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
+++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
@@ -4,7 +4,7 @@
# LICENSE file in the root directory of this source tree.
-import unittest
+from typing import Tuple
import torch
from diffusers.models.autoencoders import AutoencoderKL
@@ -14,10 +14,16 @@
from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
AutoencoderKL_config,
)
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+input_t = Tuple[torch.Tensor]
-class TestAutoencoderKL(unittest.TestCase):
+class TestAutoencoderKL:
"""
Test class of AutoencoderKL.
AutoencoderKL is the encoder/decoder used by Stable Diffusion 3.5 Medium
@@ -41,40 +47,68 @@ def forward(self, *args, **kwargs):
return auto_encoder_model, auto_encoder_model_inputs
- def test_AutoencoderKL_tosa_FP(self):
- auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- auto_encoder_model,
- example_inputs=auto_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
- )
- .export()
- .to_edge_transform_and_lower()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=auto_encoder_model_inputs,
- )
- )
-
- def test_AutoencoderKL_tosa_INT(self):
- auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
- with torch.no_grad():
- (
- ArmTester(
- auto_encoder_model,
- example_inputs=auto_encoder_model_inputs,
- compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- )
- .quantize()
- .export()
- .to_edge_transform_and_lower()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(
- inputs=auto_encoder_model_inputs,
- atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
- )
- )
+
+def test_AutoencoderKL_tosa_FP():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineFP[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.run()
+
+
+def test_AutoencoderKL_tosa_INT():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = TosaPipelineINT[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ use_to_edge_transform_and_lower=True,
+ atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_FP():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=True,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_INT():
+ auto_encoder_model, auto_encoder_model_inputs = (
+ TestAutoencoderKL().prepare_model_and_inputs()
+ )
+ with torch.no_grad():
+ pipeline = VgfPipeline[input_t](
+ auto_encoder_model,
+ auto_encoder_model_inputs,
+ aten_op=[],
+ exir_op=[],
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=True,
+ atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+ )
+ pipeline.run()
From edf69278ea59dc681a72ee3697021e6af533bb97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0imon=20Str=C3=BD=C4=8Dek?=
Date: Thu, 2 Oct 2025 16:02:21 +0200
Subject: [PATCH 102/266] NXP backend: Improve Neutron targets handling
(#14718)
### Summary
Adds NeutronTargetSpec class containing metadata about the desired
target for better handling of Neutron target support.
### Test plan
This feature modifies handling of individual operators target support
and therefore should be covered by already existing unit tests.
cc @digantdesai @JakeStevens @robert-kalmar
Co-authored-by: Jiri Ocenasek
---
.../nxp/backend/edge_program_converter.py | 10 ++-
.../ir/converter/builder/model_builder.py | 70 +++++++++-------
.../backend/ir/converter/node_converter.py | 25 ++----
.../ops_converters/add_tensor_converter.py | 17 ++--
.../ops_converters/cat_converter.py | 81 ++++++++++---------
.../constant_pad_nd_converter.py | 22 ++---
.../ops_converters/convolution_converter.py | 67 +++++++--------
.../ops_converters/mean_dim_converter.py | 38 ++++-----
.../ops_converters/softmax_converter.py | 17 +---
.../prune_transpose_operators.py | 2 +-
.../nxp/backend/neutron_converter_manager.py | 45 +++++++----
backends/nxp/backend/neutron_target_spec.py | 64 +++++++++++++++
backends/nxp/neutron_partitioner.py | 44 +++++-----
backends/nxp/nxp_backend.py | 20 ++---
backends/nxp/tests/executors.py | 10 +--
backends/nxp/tests/test_neutron_backend.py | 2 +-
.../tests/test_neutron_converter_manager.py | 9 +--
17 files changed, 289 insertions(+), 254 deletions(-)
create mode 100644 backends/nxp/backend/neutron_target_spec.py
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index 192798c151e..febcd03913a 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -18,6 +18,7 @@
from torch.fx import Node
from torch.nn.parameter import Parameter
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.backend.node_format_inference import (
NodeFormat,
NodeFormatInference,
@@ -54,12 +55,14 @@ class EdgeProgramToIRConverter:
"""
_default_conversion_config = ConversionConfig()
+ _default_target_spec = NeutronTargetSpec("imxrt700", "SDK_25_09")
_default_delegation_options = CustomDelegationOptions()
def convert_program(
self,
edge_program: ExportedProgram,
- conversion_config=_default_conversion_config,
+ conversion_config: ConversionConfig = _default_conversion_config,
+ neutron_target_spec: NeutronTargetSpec = _default_target_spec,
custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
) -> (bytes, dict):
"""
@@ -67,6 +70,7 @@ def convert_program(
:param edge_program: Converter ExportedProgram.
:param conversion_config: ConversionConfig instance.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param custom_delegation_options: Custom user options which affect node delegation.
:return: TFLite flatbuffers as bytes.
"""
@@ -76,6 +80,7 @@ def convert_program(
cc = self.build_conversion_context(
parameters_mapping,
node_formats,
+ neutron_target_spec,
conversion_config,
custom_delegation_options,
)
@@ -173,11 +178,12 @@ def map_inputs_to_parameters(edge_program: ExportedProgram) -> dict[str, Paramet
def build_conversion_context(
parameters_mapping: dict,
node_formats: dict[Node, NodeFormat],
+ neutron_target_spec: NeutronTargetSpec,
conversion_config: ConversionConfig = _default_conversion_config,
custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
) -> ConversionContext:
tflite_builder = AtenModelBuilderDirector(
- 3, "TFLite from EdgeProgram", conversion_config
+ 3, "TFLite from EdgeProgram", neutron_target_spec, conversion_config
)
# Add "sentinel" buffer (defined in schema.fbs)
diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py
index 496fa752853..643a6231d15 100755
--- a/backends/nxp/backend/ir/converter/builder/model_builder.py
+++ b/backends/nxp/backend/ir/converter/builder/model_builder.py
@@ -48,6 +48,7 @@
FlexTranspose,
)
from executorch.backends.nxp.backend.ir.tflite_optimizer import optimizer
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
class ModelBuilder:
@@ -74,17 +75,21 @@ class ModelBuilder:
_zeros_tensor_map: Dict # Mapping 'string' shapes to 'tflT.Tensor' objects
- _default_conversion_config = ConversionConfig()
+ neutron_target_spec: NeutronTargetSpec
conversion_config: ConversionConfig
+ _default_conversion_config = ConversionConfig()
+
def __init__(
self,
model_version: int,
model_description: str,
+ neutron_target_spec: NeutronTargetSpec,
conversion_config: ConversionConfig = _default_conversion_config,
) -> None:
self._tfl_model = tflite_model.Model(model_version, model_description)
+ self.neutron_target_spec = neutron_target_spec
self.conversion_config = conversion_config
self.op_code_type_index_map = {}
@@ -471,31 +476,7 @@ def finish(self) -> tflite_model.Model:
return self._tfl_model
- def _assign_tensor_and_buffer_indices( # noqa C901
- self, allow_inputs_stripping: bool
- ):
- """Correctly initialize all references via indices in all tensors and buffers."""
-
- # Assign each buffer its index
- for i, buffer in enumerate(self.get_buffers().vector):
- buffer.tmp_index = i
-
- # Assign each tensor its index and its buffer index
- for i, tensor in enumerate(self.get_tensors().vector):
- if tensor.tmp_null_tensor:
- # Using -1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that
- # this tensor should not be used.
- # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
- tensor.tmp_index = -1
- else:
- tensor.tmp_index = i
-
- tensor.buffer = tensor.tmp_buffer.tmp_index
-
- # TODO Remove inputs and outputs that are not in the tensors collection
-
- # Assign 'Outputs' and 'Inputs' their tensor indices
- outputs = self.get_sub_graph().outputs
+ def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
for tensor in outputs.tmp_outputs:
try:
outputs.append(tensor.tmp_index)
@@ -505,7 +486,6 @@ def _assign_tensor_and_buffer_indices( # noqa C901
f"The tensor '{tensor.name}' is among the model outputs, but does NOT appear in the graph!",
)
- inputs = self.get_sub_graph().inputs
for tensor in inputs.tmp_inputs:
try:
inputs.append(tensor.tmp_index)
@@ -520,14 +500,46 @@ def _assign_tensor_and_buffer_indices( # noqa C901
f"The tensor '{tensor.name}' is among the model inputs, but does NOT appear in the graph!",
)
- # Assign each operator its inputs and outputs indices
- for operator in self.get_sub_graph().operators.vector:
+ def _assign_operators_io_tensor_indices(self, operators):
+ for operator in operators.vector:
for inputTensor in operator.tmp_inputs:
operator.inputs.append(inputTensor.tmp_index)
for outputTensor in operator.tmp_outputs:
operator.outputs.append(outputTensor.tmp_index)
+ def _assign_tensor_and_buffer_indices(self, allow_inputs_stripping: bool):
+ """Correctly initialize all references via indices in all tensors and buffers."""
+
+ # Assign each buffer its index
+ for i, buffer in enumerate(self.get_buffers().vector):
+ buffer.tmp_index = i
+
+ # Assign each tensor its index and its buffer index
+ for i, tensor in enumerate(self.get_tensors().vector):
+ if tensor.tmp_null_tensor:
+                # Using -1 as the index to the 'tensors' vector is a way of telling the TFLite inference engine that
+ # this tensor should not be used.
+ # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
+ tensor.tmp_index = -1
+ else:
+ tensor.tmp_index = i
+
+ tensor.buffer = tensor.tmp_buffer.tmp_index
+
+ # TODO Remove inputs and outputs that are not in the tensors collection
+
+ subgraph = self.get_sub_graph()
+
+ # Assign 'Outputs' and 'Inputs' their tensor indices
+ self._assign_io_tensor_indices(
+ inputs=subgraph.inputs,
+ outputs=subgraph.outputs,
+ allow_inputs_stripping=allow_inputs_stripping,
+ )
+ # Assign each operator its inputs and outputs indices
+ self._assign_operators_io_tensor_indices(operators=subgraph.operators)
+
def _build_operator_code(
self, op_type: BuiltinOperator, version, custom_code: str = None
):
diff --git a/backends/nxp/backend/ir/converter/node_converter.py b/backends/nxp/backend/ir/converter/node_converter.py
index c44a6e19955..36266486aac 100755
--- a/backends/nxp/backend/ir/converter/node_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converter.py
@@ -4,7 +4,6 @@
# LICENSE file in the root directory of this source tree.
from abc import ABC, abstractmethod
-from enum import Enum
import torch
@@ -16,6 +15,7 @@
AtenModelBuilderDirector,
)
from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.exir.dialects._ops import ops as exir_ops
from torch.fx import Node
from torch.fx.passes.infra.partitioner import Partition
@@ -42,17 +42,6 @@ def is_not_qdq_node(node: torch.fx.Node) -> bool:
return not (_is_quant_node(node) or _is_dequant_node(node))
-class Target(Enum):
- IGNORE = "ignore" # No target platform. Any target specific restrictions will be ignored.
-
- RT700 = "imxrt700"
- IMX95 = "imx95"
-
- @classmethod
- def values(cls) -> list[str]:
- return [elt.value for elt in cls]
-
-
class NodeConverter(ABC):
"""
Classes which implement conversion of torch.Node to TFLite should inherit from this class and overwrite the
@@ -94,7 +83,7 @@ def _is_supported_in_IR(
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
@@ -103,31 +92,31 @@ def _is_supported_on_target(
can be used by operators with no target specific requirements.
:param node: The node (edge operator) to check.
- :param target: Value of the `Target` enum representing the target platform to check for.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param parameters_mapping: Dictionary mapping tensor names to their static data (if they have it).
:param custom_delegation_options: Custom options which affect delegation.
"""
- return target == Target.RT700
+ return True
@classmethod
def is_supported(
cls,
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
"""Check if the given `node` is supported in the IR and on the given `target` platform.
:param node: torch.Node to check.
- :param target: Value of the `Target` enum representing the target platform to check for.
+ :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param parameters_mapping: Dict mapping tensor names to their data.
:param custom_delegation_options: Custom user options which affect node delegation.
"""
return cls._is_supported_in_IR(
node, parameters_mapping, custom_delegation_options
) and cls._is_supported_on_target(
- node, target, parameters_mapping, custom_delegation_options
+ node, neutron_target_spec, parameters_mapping, custom_delegation_options
)
@classmethod
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
index c74baa61f67..cd5aa2ead81 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
@@ -9,11 +9,11 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
add_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -22,20 +22,15 @@ class AddTensorConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- if node_uses_shape_broadcasting(node):
- # Shape broadcasting may require the addition of `Transpose` ops during conversion.
- return False
-
- return True
+ if node_uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
- case _:
- return False
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
index 4f7f00fe5ba..22ca258cd4f 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
@@ -13,11 +13,11 @@
_is_dequant_node,
_is_quant_node,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.concatenation_options import (
Concatenation,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -72,51 +72,52 @@ def _all_io_shares_quantization_parameters(node: Node) -> bool:
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
if custom_delegation_options.force_delegate_cat:
return True
- match target:
- case Target.RT700:
- dim = CatConverter._get_normalized_dim(node)
-
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
- if dim == 0:
- return False
-
- # Neutron requires the channels to be a multiple of `8`. The channels could either be the second or the
- # last dimension, depending on the formats of the node. The format, however, cannot be determined
- # during conversion, as it depends on what other nodes are delegated.
- input_channels = [
- # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
- # will still be the channels in the IR.
- _get_shape(input_)[1]
- for input_ in node.all_input_nodes
- ] + [
- # If the inputs/outputs are channels first, the last dimension will be the channels.
- _get_shape(input_)[-1]
- for input_ in node.all_input_nodes
- ]
- if any((input_channel % 8) != 0 for input_channel in input_channels):
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
- return False
-
- output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
- if any((out_c % 8) != 0 for out_c in output_channels):
- return False
-
- if len(node.all_input_nodes) < 2: # Not supported on Neutron
- # TODO Try to skip the operator if this case is realistic.
- return False
-
- return True
-
- case _:
- return False
+ dim = CatConverter._get_normalized_dim(node)
+
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
+ if dim == 0:
+ return False
+
+ # Neutron requires the channels to be a multiple of numMacs. The channels could either be the second or the
+ # last dimension, depending on the formats of the node. The format, however, cannot be determined
+ # during conversion, as it depends on what other nodes are delegated.
+ input_channels = [
+ # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
+ # will still be the channels in the IR.
+ _get_shape(input_)[1]
+ for input_ in node.all_input_nodes
+ ] + [
+ # If the inputs/outputs are channels first, the last dimension will be the channels.
+ _get_shape(input_)[-1]
+ for input_ in node.all_input_nodes
+ ]
+ if any(
+ (input_channel % neutron_target_spec.get_num_macs()) != 0
+ for input_channel in input_channels
+ ):
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
+ return False
+
+ output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
+ # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
+ if any(
+ (out_c % neutron_target_spec.get_num_macs()) != 0
+ for out_c in output_channels
+ ):
+ return False
+
+ if len(node.all_input_nodes) < 2: # Not supported on Neutron
+ # TODO Try to skip the operator if this case is realistic.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
index f58df1a88d9..499541aa58c 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
@@ -17,7 +17,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.quantization_utils import (
quantize_int8,
@@ -27,6 +26,7 @@
pad_options,
pad_v2_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -35,22 +35,16 @@ class ConstantPadNDConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # TODO: Consider different tensor formats (dim-order)
- paddings = node.args[1]
- if len(paddings) > 4 and paddings[4:6] != [0, 0]:
- # Attempt to Pad channels dimension, which is not supported on Neutron.
- return False
-
- return True
-
- case _:
- return False
+ paddings = node.args[1]
+ if len(paddings) > 4 and paddings[4:6] != [0, 0]:
+ # Attempt to Pad channels dimension, which is not supported on Neutron.
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
index 8955b4c8fd4..f32b5a65cac 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
@@ -25,7 +25,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.node_converters.shared import (
conv_utils,
@@ -45,6 +44,7 @@
depthwise_conv_2d_options,
reshape_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -53,45 +53,38 @@ class ConvolutionConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- activations = node.args[0]
- weights = node.args[1]
- groups = node.args[8]
-
- if activations.meta["val"].shape[0] != 1:
- # Only batch size 1 is supported on neutron.
- return False
-
- if groups == 1: # Regular convolution.
- pass
- elif conv_utils.group_conv_convertible_as_depthwise(
- node, groups
- ): # Depthwise convolution.
- # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
- # weights. In case the weights are dynamic, a Transpose operator would have to be added, which
- # is not supported on Neutron.
- if not node_is_effectively_static_tensor(
- weights, parameters_mapping
- ):
- return False
- elif conv_utils.group_conv_convertible_into_multiple_convolutions(
- node, groups
- ): # Separable conv. This should never be reached, as the node should have been decomposed into
- # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
- logging.warning("Group convolution was not decomposed.")
- return False
- else: # Unexpected case (should never happen).
- return False
-
- return True
-
- case _:
+ activations = node.args[0]
+ weights = node.args[1]
+ groups = node.args[8]
+
+ if activations.meta["val"].shape[0] != 1:
+ # Only batch size 1 is supported on neutron.
+ return False
+
+ if groups == 1: # Regular convolution.
+ pass
+ elif conv_utils.group_conv_convertible_as_depthwise(
+ node, groups
+ ): # Depthwise convolution.
+ # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
+ # weights. In case the weights are dynamic, a Transpose operator would have to be added, which
+ # is not supported on Neutron.
+ if not node_is_effectively_static_tensor(weights, parameters_mapping):
return False
+ elif conv_utils.group_conv_convertible_into_multiple_convolutions(
+ node, groups
+ ): # Separable conv. This should never be reached, as the node should have been decomposed into
+ # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
+ logging.warning("Group convolution was not decomposed.")
+ return False
+ else: # Unexpected case (should never happen).
+ return False
+
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -238,7 +231,7 @@ def _convert_1d_conv(
def _convert_unpadded_2D(
self, t_op: tflite_model.Operator, conv_params: ConvParameters
) -> conv_utils.ConvConversionResult:
- """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converter by the
+ """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converted by the
caller.
"""
common.assign_2d_strides(t_op.builtin_options, conv_params.stride)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
index f03c403876f..c1dd7b600be 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
@@ -12,7 +12,6 @@
from executorch.backends.nxp.backend.ir.converter.node_converter import (
CustomDelegationOptions,
NodeConverter,
- Target,
)
from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import (
convert_axes_from_attribute,
@@ -20,6 +19,7 @@
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
mean_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -28,34 +28,20 @@ class MeanDimConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # TODO: Consider different tensor formats (dim-order)
- dim = node.args[1]
- keepdim = node.args[2] if len(node.args) >= 3 else False
- rank = len(node.args[0].meta["val"].shape)
- dim = [MeanDimConverter._to_neg_dim(d, rank) for d in dim]
-
- # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
- if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
- return False
-
- return True
-
- case _:
- return False
+ dim = node.args[1]
+ keepdim = node.args[2] if len(node.args) >= 3 else False
+ rank = len(node.args[0].meta["val"].shape)
+ dim = [d - rank if d > 0 else d for d in dim]
- @staticmethod
- def _to_pos_dim(d, rank):
- return d + rank if d < 0 else d
+ # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
+ if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
+ return False
- @staticmethod
- def _to_neg_dim(d, rank):
- return d - rank if d > 0 else d
+ return True
@staticmethod
def _is_supported_in_IR(
@@ -75,6 +61,10 @@ def _is_supported_in_IR(
return True
+ @staticmethod
+ def _to_pos_dim(d: int, rank: int):
+ return d + rank if d < 0 else d
+
@staticmethod
def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]:
# convert negative index to positive
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
index aa74c78ca24..5e4404d8476 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
@@ -7,13 +7,11 @@
CustomDelegationOptions,
)
from executorch.backends.nxp.backend.edge_helper import input_rank
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
- NodeConverter,
- Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
softmax_options,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.fx import Node
from torch.nn import Parameter
@@ -22,18 +20,11 @@ class SoftmaxConverter(NodeConverter):
@staticmethod
def _is_supported_on_target(
node: Node,
- target: Target,
+ neutron_target_spec: NeutronTargetSpec,
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
) -> bool:
- match target:
- case Target.RT700:
- # The eIQ Neutron NPU runtime software has a known issue with the SoftMax operation.
- # As long as the issue is present, return False for the i.MX RT700 target also.
- return False
-
- case _:
- return False
+ return False
@staticmethod
def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
index dc9ad9999b4..0be46efcaa8 100755
--- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
+++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
index 2bc4380f89b..a6884a9ee24 100644
--- a/backends/nxp/backend/neutron_converter_manager.py
+++ b/backends/nxp/backend/neutron_converter_manager.py
@@ -7,8 +7,6 @@
import multiprocessing
import pkgutil
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
-
def convert_unsafe(neutron_converter, tflite_model, cctx, queue):
"""
@@ -27,16 +25,7 @@ class NeutronConverterManager:
contains NeutronGraph nodes.
"""
- _supported_target_names = [Target.RT700.value]
-
- def convert(
- self, tflite_model: bytes, target: str, neutron_converter_flavor: str
- ) -> bytes:
- # Neutron converter crashes if we provide invalid target -> verify.
- if target not in self._supported_target_names:
- raise RuntimeError(
- f"Target '{target}' is not supported by NeutronConverterManager."
- )
+ def __init__(self, neutron_converter_flavor: str = "SDK_25_09"):
neutron_converter_modules = [
module.name
@@ -57,13 +46,34 @@ def convert(
f"not found. Install 'neutron_converter_[flavor]' Python package."
)
- neutron_converter = importlib.import_module(
+ self.neutron_converter = importlib.import_module(
f"{requested_module_name}.neutron_converter"
)
+ self.neutron_library_utils = importlib.import_module(
+ f"{requested_module_name}.neutron_library_utils"
+ )
+
+ def get_converter(self):
+ return self.neutron_converter
+
+ def get_library_utils(self):
+ return self.neutron_library_utils
+
+ def verify_target(self, target: str):
+ if not self.neutron_library_utils.isNeutronTarget(target):
+ valid_targets = [
+ target.name for target in self.neutron_library_utils.getNeutronTargets()
+ ]
+ raise ValueError(
+ f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`."
+ )
+
+ def convert(self, tflite_model: bytes, target: str) -> bytes:
+ # Neutron converter crashes if we provide invalid target -> verify.
+ self.verify_target(target)
- cctx = neutron_converter.CompilationContext()
- cctx.targetOpts = neutron_converter.getNeutronTarget(target)
- # New switch since Neutron Converter SDK_25.06
+ cctx = self.neutron_converter.CompilationContext()
+ cctx.targetOpts = self.neutron_converter.getNeutronTarget(target)
cctx.compilationOpts.minNumOpsPerGraph = 1
logger = multiprocessing.log_to_stderr()
@@ -71,7 +81,8 @@ def convert(
queue = multiprocessing.Manager().Queue()
process = multiprocessing.Process(
- target=convert_unsafe, args=(neutron_converter, tflite_model, cctx, queue)
+ target=convert_unsafe,
+ args=(self.neutron_converter, tflite_model, cctx, queue),
)
process.start()
process.join() # waits until the subprocess is complete
diff --git a/backends/nxp/backend/neutron_target_spec.py b/backends/nxp/backend/neutron_target_spec.py
new file mode 100644
index 00000000000..44399982e29
--- /dev/null
+++ b/backends/nxp/backend/neutron_target_spec.py
@@ -0,0 +1,64 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Target Spec for the NXP Neutron NPU
+
+from enum import Enum
+
+from executorch.backends.nxp.backend.neutron_converter_manager import (
+ NeutronConverterManager,
+)
+
+
+class NeutronHWVersion(Enum):
+ N1 = 1
+ N3 = 2
+
+
+class NeutronTargetSpec:
+ """
+ The functionality for probing the properties of Neutron Target.
+ """
+
+ def __init__(self, target: str, neutron_converter_flavor: str):
+
+ converter_manager = NeutronConverterManager(neutron_converter_flavor)
+ converter_manager.verify_target(target)
+ neutron_converter = converter_manager.get_converter()
+ self.neutron_target = neutron_converter.getNeutronTarget(target)
+
+ if self.is_subsystem():
+ raise ValueError(
+ f"Target `{target}` is not a neutron-C target. Only MCU targets are supported at the moment."
+ )
+
+ if self.get_hw_version() != NeutronHWVersion.N3:
+ raise ValueError(
+ f"Target `{target}` contains unsupported HW version. Only N3/N3+ targets are supported at the moment."
+ )
+
+ # Target name.
+ def get_name(self) -> str:
+ return self.neutron_target.name
+
+ # Whether the target has subsystem (Neutron-S) or not (Neutron-C).
+ def is_subsystem(self) -> bool:
+ return self.neutron_target.subsystem
+
+ # Number of compute units.
+ def get_num_units(self) -> int:
+ return self.neutron_target.numUnits
+
+ # Number of compute pipelines.
+ def get_num_pipes(self) -> int:
+ return self.neutron_target.numPipes
+
+ # Number of compute MACs.
+ def get_num_macs(self) -> int:
+ return self.neutron_target.numMacs
+
+ # Neutron compute block hardware version.
+ def get_hw_version(self) -> NeutronHWVersion:
+ return NeutronHWVersion(self.neutron_target.hwVersion)
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 371b7474f58..917545e6c89 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -8,7 +8,7 @@
import logging
import operator
from dataclasses import dataclass
-from typing import Dict, final, List, Mapping
+from typing import final, Mapping
import torch
@@ -18,13 +18,13 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
from torch.export.exported_program import ExportedProgram
from torch.fx import Graph
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition
from torch.fx.passes.operator_support import OperatorSupportBase
from torch.nn import Parameter
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.nxp_backend import NeutronBackend
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.backend.partitioner import (
@@ -64,7 +64,7 @@ class QDQCluster:
"""
compute_node: torch.fx.Node
- ops: List[torch.fx.Node]
+ ops: list[torch.fx.Node]
QUANTIZE_OPERATORS = [
exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
@@ -97,7 +97,7 @@ def is_dequant_node(node: torch.fx.Node) -> bool:
def is_auxiliary_node(node: torch.fx.Node) -> bool:
return node.target in QDQClusterRecognizer.AUXILIARY_OPS
- def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Return the list of nodes representing the input part of the QDQ cluster of the node `node`.
Those are various dequantization nodes (see DEQUANTIZE_OPERATORS) optionally followed by auxiliary
@@ -125,7 +125,7 @@ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]
logging.debug(f"Dequant Cluster for {node} is: {qdq_cluster}")
return qdq_cluster
- def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Returns the list of nodes representing the output part of the QDQ cluster of the `node`.
Those are various quantize nodes (see QUANTIZE_OPERATORS) preceded by auxiliary nodes.
@@ -155,7 +155,7 @@ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node
logging.debug(f"Quant Cluster for {node} is {qdq_cluster}")
return qdq_cluster
- def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+ def get_qdq_cluster(self, node: torch.fx.Node) -> list[torch.fx.Node]:
"""
Returns the QDQ cluster of the operator, if quantized. If operator is not quantized, returns empty list.
"""
@@ -167,7 +167,7 @@ def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
else:
return []
- def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
+ def tag_nodes(self, nodes: list[torch.fx.Node], cluster_name: str) -> None:
"""
Tags a node and its related dequant and quant nodes with a specified cluster name
"""
@@ -175,7 +175,7 @@ def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
logging.info(f"Tagging node {node} as {cluster_name}")
node.meta["cluster"] = cluster_name
- def tag_qdq_clusters(self, nodes: List[torch.fx.Node]):
+ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
"""
Identifies QDQ clusters and tag them based on compute operation inside.
"""
@@ -220,14 +220,14 @@ class NeutronSupportedOperators(OperatorSupportBase):
def __init__(
self,
- qdq_clusters: Dict[str, QDQClusterRecognizer.QDQCluster],
- target: Target,
- operators_not_to_delegate: List[str],
+ qdq_clusters: dict[str, QDQClusterRecognizer.QDQCluster],
+ neutron_target_spec: NeutronTargetSpec,
+ operators_not_to_delegate: list[str],
parameters_mapping: dict[str, Parameter],
custom_delegation_options: CustomDelegationOptions,
):
self.qdq_clusters = qdq_clusters
- self.target = target
+ self.neutron_target_spec = neutron_target_spec
self.operators_not_to_delegate = operators_not_to_delegate
self.parameters_mapping = parameters_mapping
self.custom_delegation_options = custom_delegation_options
@@ -269,7 +269,7 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool:
# TODO: `view_copy` node should be delegated only if it's not the only operator in the cluster.
node_converter.is_supported(
node,
- self.target,
+ self.neutron_target_spec,
self.parameters_mapping,
self.custom_delegation_options,
)
@@ -305,13 +305,16 @@ def is_node_supported(
class NeutronPartitioner(Partitioner):
def __init__(
self,
- compile_spec: List[CompileSpec],
+ compile_spec: list[CompileSpec],
custom_delegation_options: CustomDelegationOptions | None = None,
) -> None:
self.delegation_spec = DelegationSpec(NeutronBackend.__name__, compile_spec)
self.custom_delegation_options = (
custom_delegation_options or CustomDelegationOptions()
)
+ target = self.delegation_spec[1][2].value.decode()
+ converter_flavor = self.delegation_spec[1][3].value.decode()
+ self.neutron_target_spec = NeutronTargetSpec(target, converter_flavor)
def validate_partitioning_result(
self,
@@ -343,22 +346,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
# subgraphs containing the nodes with the tags
logging.info("NeutronPartitioner::partition")
partition_tags = {}
+ partition_list = []
graph_module = exported_program.graph_module
nodes = list(graph_module.graph.nodes)
qdq_cluster_recognizer = QDQClusterRecognizer()
qdq_cluster_recognizer.tag_qdq_clusters(nodes)
+
graph_module.recompile()
- target = None
- operators_not_to_delegate = ""
- for spec in self.delegation_spec.compile_specs:
- if spec.key == "target":
- target = Target(spec.value.decode())
- if spec.key == "operators_not_to_delegate":
- operators_not_to_delegate = spec.value.decode().split(",")
- assert target is not None
+ operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",")
logging.info(f"Operators not to delegate: {operators_not_to_delegate}")
parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters(
@@ -368,7 +366,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
exported_program.graph_module,
NeutronSupportedOperators(
qdq_cluster_recognizer.cluster_map,
- target,
+ self.neutron_target_spec,
operators_not_to_delegate,
parameters_mapping,
self.custom_delegation_options,
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index fd1687d73fd..44e9a19d9f2 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -18,11 +18,11 @@
from executorch.backends.nxp.backend.edge_program_converter import (
EdgeProgramToIRConverter,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat
from executorch.backends.nxp.backend.neutron_converter_manager import (
NeutronConverterManager,
)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from executorch.backends.nxp.neutron_node_extraction import (
extract_artifacts_from_neutron_node,
NeutronNodeArtifacts,
@@ -36,9 +36,9 @@
class NeutronCompileSpecBuilder:
+ config: NeutronTargetSpec
def __init__(self):
- self.config: Target = None
self.compile_spec: List[CompileSpec] = []
self.compiler_flags = []
self.output_format = None
@@ -68,14 +68,9 @@ def neutron_compile_spec(
extra_flags: Extra flags for the Neutron compiler
operators_not_to_delegate: List of operators that should not be delegated
"""
- try:
- self.config = Target(config)
- except ValueError:
- raise ValueError(
- f"Config `{config}` is not a valid target. Must be one of `{Target.values()}`."
- )
self.neutron_converter_flavor = neutron_converter_flavor
+ self.config = NeutronTargetSpec(config, neutron_converter_flavor)
assert (
self.output_format is None
@@ -101,7 +96,7 @@ def build(self):
self.compile_spec += [
CompileSpec("output_format", "tflite".encode()),
CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()),
- CompileSpec("target", self.config.value.encode()),
+ CompileSpec("target", self.config.get_name().encode()),
CompileSpec(
"neutron_converter_flavor", self.neutron_converter_flavor.encode()
),
@@ -187,10 +182,11 @@ def preprocess( # noqa C901
# Convert the edge program to TFLite.
tflite_model, io_formats = EdgeProgramToIRConverter().convert_program(
edge_program,
+ neutron_target_spec=NeutronTargetSpec(target, neutron_converter_flavor),
)
- neutron_model = NeutronConverterManager().convert(
- tflite_model, target, neutron_converter_flavor
+ neutron_model = NeutronConverterManager(neutron_converter_flavor).convert(
+ tflite_model, target
)
# Dump the tflite file if logging level is enabled
diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py
index 592717c0b3b..9626a2779c4 100644
--- a/backends/nxp/tests/executors.py
+++ b/backends/nxp/tests/executors.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 NXP
+# Copyright 2023-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -18,10 +18,8 @@
create_channels_first_to_channels_last_permutation,
create_channels_last_to_channels_first_permutation,
)
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
- NodeConverter,
- Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
from torch.export import ExportedProgram
from torch.fx import Node
from torch.fx.graph import Graph
@@ -373,7 +371,7 @@ def graph_contains_any_of_ops(graph: Graph, ops: list) -> bool:
return any(node.target in ops for node in graph.nodes)
-target_support_check_function = Callable[[Node, Target], bool]
+target_support_check_function = Callable[[Node, NeutronTargetSpec], bool]
class OverrideTargetSupportCheck:
diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py
index 53e54ec2f56..c9917651fbd 100644
--- a/backends/nxp/tests/test_neutron_backend.py
+++ b/backends/nxp/tests/test_neutron_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py
index e10e8cca67b..2fcfd8cd987 100644
--- a/backends/nxp/tests/test_neutron_converter_manager.py
+++ b/backends/nxp/tests/test_neutron_converter_manager.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -29,9 +29,7 @@ def test_conv2d_neutron_conversion__default_flavor():
)
neutron_converter_manager = NeutronConverterManager()
- neutron_model = neutron_converter_manager.convert(
- tflite_model, "imxrt700", "SDK_25_09"
- )
+ neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700")
assert len(
neutron_model
@@ -50,9 +48,8 @@ def test__conv2d_neutron_conversion__invalid_flavor():
edge_program_manager.exported_program()
)
- neutron_converter_manager = NeutronConverterManager()
with pytest.raises(RuntimeError) as excinfo:
- _ = neutron_converter_manager.convert(tflite_model, "imxrt700", "bad_flavor")
+ _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700")
assert "Neutron Converter module with flavor 'bad_flavor' not found." in str(
excinfo
From 01456041ecaf58548da1d32397553edcb2713767 Mon Sep 17 00:00:00 2001
From: Agrima Khare <121654192+agrima1304@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:10:52 +0100
Subject: [PATCH 103/266] Arm Backend: Add tests for stack.default (#14623)
Stack is not in the list of core ATen ops and is decomposed
automatically when lowering the graph
(https://docs.pytorch.org/docs/main/export.html#export-ir-decompositions),
so only the tests need to be added.
stack is in this decomp table:
https://github.com/pytorch/pytorch/blob/5d749ceb92c2c28bcfbdf918b4ab99b1a91fcb50/torch/_decomp/__init__.py#L466
Signed-off-by: Agrima Khare
---
backends/arm/test/ops/test_stack.py | 150 ++++++++++++++++++++++++++++
1 file changed, 150 insertions(+)
create mode 100644 backends/arm/test/ops/test_stack.py
diff --git a/backends/arm/test/ops/test_stack.py b/backends/arm/test/ops/test_stack.py
new file mode 100644
index 00000000000..873a599992a
--- /dev/null
+++ b/backends/arm/test/ops/test_stack.py
@@ -0,0 +1,150 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineINT,
+ EthosU85PipelineINT,
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+
+test_data_suite = {
+ # (test_name, test_data)
+ "ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0),
+ "ones_and_rand_three_tensors": lambda: (
+ (torch.ones(1, 2), torch.randn(1, 2), torch.randn(1, 2)),
+ 1,
+ ),
+ "ones_and_rand_four_tensors": lambda: (
+ (
+ torch.ones(1, 2, 5),
+ torch.randn(1, 2, 5),
+ torch.randn(1, 2, 5),
+ torch.randn(1, 2, 5),
+ ),
+ -1,
+ ),
+ "rand_two_tensors": lambda: (
+ (torch.randn(2, 2, 4), torch.randn(2, 2, 4)),
+ 2,
+ ),
+ "rand_two_tensors_dim_0": lambda: (
+ (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)),
+ ),
+ "rand_two_tensors_dim_2": lambda: (
+ (torch.randn(2, 2, 3, 5), torch.randn(2, 2, 3, 5)),
+ 2,
+ ),
+ "rand_large": lambda: (
+ (
+ 10000 * torch.randn(2, 3, 1, 4),
+ torch.randn(2, 3, 1, 4),
+ torch.randn(2, 3, 1, 4),
+ ),
+ -3,
+ ),
+}
+
+
+class Stack(nn.Module):
+ aten_op = "torch.ops.aten.stack.default"
+ exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default"
+
+ def forward(self, n: tuple[torch.Tensor, ...], dim: int = 0):
+ return torch.stack(n, dim)
+
+
+input_t1 = Tuple[torch.Tensor]
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_FP(test_module: input_t1):
+ test_data = test_module()
+ pipeline = TosaPipelineFP[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = TosaPipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.XfailIfNoCorstone300
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u55_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = EthosU55PipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_ops=Stack.aten_op,
+ exir_ops=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.XfailIfNoCorstone320
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u85_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = EthosU85PipelineINT[input_t1](
+ Stack(),
+ test_data,
+ aten_ops=Stack.aten_op,
+ exir_ops=Stack.exir_op,
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_FP(test_module: input_t1):
+ test_data = test_module()
+ pipeline = VgfPipeline[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ tosa_version="TOSA-1.0+FP",
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_INT(test_module: input_t1):
+ test_data = test_module()
+ pipeline = VgfPipeline[input_t1](
+ Stack(),
+ test_data,
+ aten_op=Stack.aten_op,
+ exir_op=Stack.exir_op,
+ tosa_version="TOSA-1.0+INT",
+ use_to_edge_transform_and_lower=False,
+ )
+ pipeline.run()
From 4372a143172df1b0037f296a36e9b5e83cdba548 Mon Sep 17 00:00:00 2001
From: Abhinayk
Date: Thu, 2 Oct 2025 10:03:06 -0700
Subject: [PATCH 104/266] Fix const prop pass when a const prop tensor has zero
stride, make it contiguous (#14725)
---
exir/passes/constant_prop_pass.py | 8 ++++
exir/tests/test_passes.py | 73 ++++++++++++++++++++++++++++++-
2 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
index 7daa3a247e8..06c1c78ee21 100644
--- a/exir/passes/constant_prop_pass.py
+++ b/exir/passes/constant_prop_pass.py
@@ -164,6 +164,14 @@ def get_propagated_const_tensor_dict(
with torch.no_grad():
# Execute the `node.target` and create a new propagated constant tensor.
prop_constant_tensor = node.target(*args_data, **kwargs_data)
+
+ # ExecuTorch doesn't support zero strides, so we need to ensure the tensor is contiguous
+ # if it has any zero strides from broadcasting/expansion operations
+ if (
+ isinstance(prop_constant_tensor, torch.Tensor)
+ and 0 in prop_constant_tensor.stride()
+ ):
+ prop_constant_tensor = prop_constant_tensor.contiguous()
const_node_to_tensor[node] = prop_constant_tensor
return const_node_to_tensor
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 716b808b087..14f105e8205 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -24,7 +24,17 @@
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
QuantizationConfig,
)
-from executorch.exir import EdgeCompileConfig, EdgeProgramManager, memory, to_edge
+from executorch.backends.xnnpack.utils.configs import (
+ get_xnnpack_executorch_backend_config,
+)
+
+from executorch.exir import (
+ EdgeCompileConfig,
+ EdgeProgramManager,
+ memory,
+ to_edge,
+ to_edge_transform_and_lower,
+)
from executorch.exir.dialects._ops import bind_pattern_to_op, ops, ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
from executorch.exir.emit import emit_program
@@ -2022,3 +2032,64 @@ def forward(self, x):
pass_result = constant_prop_pass(edge.exported_program())
# 1 constant: a (= self.w @ self.cst)
self.assertEqual(1, len(pass_result.constants))
+
+ def test_constant_prop_pass_zero_stride_tensors(self) -> None:
+ """
+ Test that constant propagation correctly handles tensors with zero strides
+ by converting them to contiguous tensors. Zero-stride tensors can be created
+ by operations like expand() and are not supported by ExecuTorch.
+ """
+
+ class ZeroStrideModel(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.const_param = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0]))
+
+ def forward(self, x):
+ unsqueezed = self.const_param.unsqueeze(
+ 1
+ ) # Shape: (3, 1), strides: (1, 1)
+ # expand creates zero-stride tensor
+ expanded = unsqueezed.expand(3, 5) # Shape: (3, 5), strides: (1, 0)
+
+ # Use the expanded tensor with the input to prevent elimination
+ result = x + expanded.sum()
+ return result
+
+ model = ZeroStrideModel()
+ x = torch.randn(3, 5)
+ exported = torch.export.export(model, (x,))
+
+ # Before constant prop: verify we have the parameter
+ self.assertIn("const_param", exported.state_dict)
+
+ const_prop_result = constant_prop_pass(exported)
+ lowered = to_edge_transform_and_lower(
+ const_prop_result,
+ partitioner=[XnnpackPartitioner()],
+ )
+
+ # Should go through
+ lowered.to_executorch(get_xnnpack_executorch_backend_config([SpecPropPass()]))
+ self.assertGreater(len(const_prop_result.constants), 0)
+
+ # Find the propagated constant tensor
+ prop_tensor = None
+ for constant_name, constant_tensor in const_prop_result.constants.items():
+ if constant_name.startswith("_prop_tensor_constant"):
+ prop_tensor = constant_tensor
+ break
+
+ # Verify the propagated tensor exists and has no zero strides
+ self.assertIsNotNone(prop_tensor)
+ self.assertNotIn(
+ 0,
+ prop_tensor.stride(),
+ f"Propagated tensor still has zero stride: {prop_tensor.stride()}",
+ )
+
+ # Verify the tensor is contiguous
+ self.assertTrue(
+ prop_tensor.is_contiguous(),
+ f"Propagated tensor is not contiguous: {prop_tensor.stride()}",
+ )
From 0882c9b689196791384a74ba1a2da695cd1cba4b Mon Sep 17 00:00:00 2001
From: DannyYuyang-quic
Date: Fri, 3 Oct 2025 01:30:47 +0800
Subject: [PATCH 105/266] Qualcomm AI Engine Direct - GA Static
Gemma-2b-instruct (#14459)
### Summary:
- e2e script for Gemma-2b-it in static llama version
- add model params file & model weight converter
### Test plan
``` bash
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma-2b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```
---
backends/qualcomm/tests/test_qnn_delegate.py | 59 ++++++++++
examples/models/gemma/__init__.py | 16 +++
examples/models/gemma/config/2b_config.json | 19 ++++
examples/models/gemma/convert_weights.py | 104 ++++++++++++++++++
examples/qualcomm/oss_scripts/llama/README.md | 20 +++-
.../qualcomm/oss_scripts/llama/__init__.py | 31 ++++++
.../oss_scripts/llama/decoder_constants.py | 1 +
examples/qualcomm/oss_scripts/llama/llama.py | 16 ++-
.../oss_scripts/llama/qnn_llama_runner.cpp | 3 +-
.../oss_scripts/llama/runner/runner.cpp | 6 +-
.../oss_scripts/llama/runner/runner.h | 1 +
11 files changed, 265 insertions(+), 11 deletions(-)
create mode 100644 examples/models/gemma/__init__.py
create mode 100644 examples/models/gemma/config/2b_config.json
create mode 100644 examples/models/gemma/convert_weights.py
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index e3cf52b9a6f..7018edcbb9c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4968,6 +4968,65 @@ def test_qnn_backend_seq_mse(self):
class TestExampleLLMScript(TestQNN):
+ def test_static_gemma_2b(self):
+ if not self.required_envs():
+ self.skipTest("missing required envs")
+
+ prompt = "My favourite condiment is "
+ cmds = [
+ "python",
+ f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+ "--artifact",
+ self.artifact_dir,
+ "--build_folder",
+ self.build_folder,
+ "--model",
+ self.model,
+ "--ip",
+ self.ip,
+ "--port",
+ str(self.port),
+ "--prompt",
+ f"{prompt}",
+ "--decoder_model",
+ "gemma-2b",
+ "--model_mode",
+ "kv",
+ "--max_seq_len",
+ "1024",
+ "--eval_perplexity",
+ "--tasks",
+ "wikitext",
+ "--limit",
+ "1",
+ ]
+ if self.compile_only:
+ cmds.extend(["--compile_only"])
+ elif self.device:
+ cmds.extend(["--device", self.device])
+ if self.host:
+ cmds.extend(["--host", self.host])
+ elif self.enable_x86_64:
+ cmds.extend(["--enable_x86_64"])
+ if self.pre_gen_pte:
+ cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+ p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+ with Listener((self.ip, self.port)) as listener:
+ conn = listener.accept()
+ p.communicate()
+ msg = json.loads(conn.recv())
+ if "Error" in msg:
+ self.fail(msg["Error"])
+ else:
+ inference_speed_ref = {"SM8650": 32, "SM8750": 36}
+ self.assertLessEqual(msg["wiki_ppl"], 35)
+ self.assertLessEqual(msg["pte_size"], 2_700_000_000) # 2.7GB
+ if self.model in inference_speed_ref:
+ self.assertGreaterEqual(
+ msg["inference_speed"], inference_speed_ref[self.model]
+ )
+
def test_static_gemma3_1b(self):
if not self.required_envs():
self.skipTest("missing required envs")
diff --git a/examples/models/gemma/__init__.py b/examples/models/gemma/__init__.py
new file mode 100644
index 00000000000..13a14ff0751
--- /dev/null
+++ b/examples/models/gemma/__init__.py
@@ -0,0 +1,16 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.gemma.convert_weights import convert_weights
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class GemmaModel(Llama2Model):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+
+__all__ = [
+ "GemmaModel",
+ "convert_weights",
+]
diff --git a/examples/models/gemma/config/2b_config.json b/examples/models/gemma/config/2b_config.json
new file mode 100644
index 00000000000..20a40723c30
--- /dev/null
+++ b/examples/models/gemma/config/2b_config.json
@@ -0,0 +1,19 @@
+{
+ "dim": 2048,
+ "ffn_dim_multiplier": 1,
+ "hidden_dim": 16384,
+ "n_heads": 8,
+ "head_dim": 256,
+ "n_kv_heads": 1,
+ "n_layers": 18,
+ "act_fn": "gelu",
+ "norm_type": "gemma3",
+ "norm_eps": 1e-06,
+ "rope_theta": 10000.0,
+ "use_scaled_rope": false,
+ "apply_embedding": true,
+ "embedding_scale_factor": 45.254833995939045,
+ "vocab_size": 256000,
+ "use_hf_rope": true,
+ "attention_qkv_bias": false
+}
diff --git a/examples/models/gemma/convert_weights.py b/examples/models/gemma/convert_weights.py
new file mode 100644
index 00000000000..09a17bc2266
--- /dev/null
+++ b/examples/models/gemma/convert_weights.py
@@ -0,0 +1,104 @@
+import argparse
+
+import json
+import os
+from typing import Dict
+
+import torch
+from safetensors.torch import load_file
+
+from torchtune.models.convert_weights import get_mapped_key
+
+
+# Weight mappings from Gemma's checkpoint to ExecuTorch's transformer parameters.
+_GEMMA_TO_EXECUTORCH = {
+ "model.embed_tokens.weight": "tok_embeddings.weight",
+ "model.norm.weight": "norm.weight",
+ "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+ "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+ "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+ "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+ "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+ "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+ "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
+ "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+ "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+}
+
+
+def gemma_to_executorch(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ """
+ Convert the state dict so that it matches what ExecuTorch's transformer definition expects.
+ """
+ converted_state_dict = {}
+ for key, value in state_dict.items():
+ new_key = get_mapped_key(key, _GEMMA_TO_EXECUTORCH)
+ converted_state_dict[new_key] = value
+ converted_state_dict["output.weight"] = converted_state_dict[
+ "tok_embeddings.weight"
+ ]
+ return converted_state_dict
+
+
+def load_checkpoint_from_safetensors(input_dir: str) -> Dict:
+ index_path = os.path.join(input_dir, "model.safetensors.index.json")
+ if os.path.exists(index_path):
+ # Sharded checkpoint.
+ with open(index_path, "r") as f:
+ index = json.load(f)
+ weight_map = index["weight_map"]
+ checkpoint_shards = sorted(set(weight_map.values()))
+
+ # Load all the shards into memory
+ shard_to_weights = {}
+ for shard in checkpoint_shards:
+ shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))
+
+ # Merge tensors into consolidated state dict.
+ merged_state_dict = {}
+ for weight_name, shard in weight_map.items():
+ tensor = shard_to_weights[shard][weight_name]
+ merged_state_dict[weight_name] = tensor
+ return merged_state_dict
+ else:
+ # Single checkpoint.
+ state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
+ return state_dict
+
+
+def load_checkpoint(input_dir: str) -> Dict:
+ pytorch_path = os.path.join(input_dir, "pytorch_model.bin")
+ if os.path.exists(pytorch_path):
+ print("Loading checkpoint from PyTorch .bin file")
+ return torch.load(pytorch_path, map_location="cpu", weights_only=True)
+ print("Loading checkpoint from safetensors directory")
+ return load_checkpoint_from_safetensors(input_dir)
+
+
+def convert_weights(input_dir: str, output_file: str) -> None:
+ print("Loading checkpoint...")
+ sd = load_checkpoint(input_dir)
+ print("Converting checkpoint...")
+ sd = gemma_to_executorch(sd)
+ print("Saving checkpoint...")
+ torch.save(sd, output_file)
+ print("Done.")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Convert Gemma weights to ExecuTorch transformer format."
+ )
+ parser.add_argument(
+ "input_dir",
+ type=str,
+ help="Path to directory containing safetensor checkpoint files, or PyTorch checkpoint file.",
+ )
+ parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+ args = parser.parse_args()
+ convert_weights(args.input_dir, args.output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 1be94ec04d6..9bb76142362 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -5,12 +5,13 @@ This file provides you the instructions to run LLM Decoder model with different
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
- 4. Gemma3 1B
- 5. Phi4-mini-instruct
- 6. QWEN2.5 0.5B / 1.5B
- 7. QWEN3 0.6B / 1.7B
- 8. SmolLM2 135M
- 9. SmolLM3 3B
+ 4. Gemma 2B
+ 5. Gemma3 1B
+ 6. Phi4-mini-instruct
+ 7. QWEN2.5 0.5B / 1.5B
+ 8. QWEN3 0.6B / 1.7B
+ 9. SmolLM2 135M
+ 10. SmolLM3 3B
We offer the following modes to execute the model:
@@ -78,6 +79,13 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```
+#### Gemma 2B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma-2b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
+```
+
+
#### Gemma3 1B
Default example using hybrid mode
```bash
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 5908fcf32a6..628defc1496 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -24,6 +24,7 @@
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
from executorch.examples.models.phi_4_mini import (
convert_weights as convert_phi_4_mini_weights,
@@ -300,6 +301,36 @@ class Llama3_2_3B_Instruct(LLMModelConfig):
)
+@register_llm_model("gemma-2b")
+@dataclass(init=False, frozen=True)
+class Gemma_2B(LLMModelConfig):
+ repo_id: str = "google/gemma-2b-it"
+ params_path: str = os.path.join(
+ BASE_DIR, "../../../models/gemma/config/2b_config.json"
+ )
+ convert_weights = convert_gemma_weights
+ transform_weight = False
+ instruct_model = True
+
+ num_sharding = 4
+ # quant config
+ ptq = QuantDtype.use_16a4w_block
+ group_size = 64
+ masked_softmax = True
+ seq_mse_candidates = 0
+ r1 = False
+ r2 = False
+ r3 = False
+ quantization_config_wv_sha_16a8w = get_ptq_per_channel_quant_config(
+ torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+ )
+ custom_annotation = (
+ annotate_kv_8bit,
+ annotate_output_16a8w,
+ partial(annotate_wv_sha, quantization_config=quantization_config_wv_sha_16a8w),
+ )
+
+
@register_llm_model("gemma3-1b")
@dataclass(init=False, frozen=True)
class Gemma3(LLMModelConfig):
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
index ac96770b889..d43ceb8351a 100644
--- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py
+++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -14,6 +14,7 @@
DECODER_MODEL_VERSION = {
"stories260k": "llama2",
"stories110m": "llama2",
+ "gemma-2b": "gemma",
"gemma3-1b": "gemma3",
"phi_4_mini": "phi_4_mini",
"llama3_2-1b_instruct": "llama3",
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index ae5ae63d509..887e680341f 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -327,6 +327,13 @@ def quantize(
chat_template, args.prompt[0], args.system_prompt
)
)
+
+ # Gemma may produce unexpected output if the prompt contains an extra token.
+ # This can happen after applying a prompt template, which might inject one unintentionally.
+ # To prevent decoding issues, we explicitly remove the extra token.
+ if chat_template and args.decoder_model in {"gemma-2b", "gemma3-1b"}:
+ prompt = prompt.replace("", "")
+
graph_module_inference(
use_kv_cache=self.llama_meta["get_use_kv_cache"],
get_example_inputs=self.get_example_inputs,
@@ -534,14 +541,13 @@ def compile(
state_dict = torch.load(
checkpoint, weights_only=True, map_location="cpu", mmap=True
)
- if args.decoder_model == "gemma3-1b":
+ if args.decoder_model in {"gemma-2b", "gemma3-1b"}:
for k, v in state_dict.items():
if "norm" not in k:
continue
# Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
# See https://github.com/huggingface/transformers/pull/29402
state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32)
-
else:
state_dict = torch.load(
args.checkpoint, weights_only=True, map_location="cpu", mmap=True
@@ -1286,7 +1292,11 @@ def export_llama(args) -> None:
)
tokenizer_artifacts = tokenizer.save_pretrained(args.artifact)
tokenizer_config = tokenizer_artifacts[0]
- runtime_tokenizer_path = tokenizer_artifacts[-1]
+ if args.decoder_model == "gemma-2b":
+ # For Gemma, use tokenizer.model as it doesn't provide pre_tokenizer in tokenizer.json.
+ runtime_tokenizer_path = tokenizer_artifacts[-3]
+ else:
+ runtime_tokenizer_path = tokenizer_artifacts[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config)
# TODO: Remove this once error is resolved.
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 71eaea2b8d6..2bffb35852a 100644
--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -9,7 +9,7 @@
/**
* @file
*
- * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma3 1B,
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma 2B, Gemma3 1B,
* phi4-mini-instruct, Qwen2.5 0.5B / 1.5B, Qwen3 0.6B / 1.7B, SmolLM2 135M,
* SmolLM3 3B with Qualcomm AI Engine Direct.
*
@@ -117,6 +117,7 @@ std::string get_formatted_prompt(
formatted_prompt.append(
"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
break;
+ case example::DecoderModelVersion::kGemma:
case example::DecoderModelVersion::kGemma3:
formatted_prompt.append("user\n");
formatted_prompt.append(prompt);
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index fe45d4b6a67..0c4884bbccf 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -122,6 +122,8 @@ Runner::Runner(
decoder_model_version_ = DecoderModelVersion::kLlama2;
} else if (decoder_model_version == "llama3") {
decoder_model_version_ = DecoderModelVersion::kLlama3;
+ } else if (decoder_model_version == "gemma") {
+ decoder_model_version_ = DecoderModelVersion::kGemma;
} else if (decoder_model_version == "gemma3") {
decoder_model_version_ = DecoderModelVersion::kGemma3;
cache_mode_ = CacheMode::HybridCache;
@@ -199,7 +201,9 @@ Error Runner::load() {
decoder_model_version_ == DecoderModelVersion::kSmollm2_135m ||
decoder_model_version_ == DecoderModelVersion::kSmollm3) {
eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]);
- } else if (decoder_model_version_ == DecoderModelVersion::kGemma3) {
+ } else if (
+ decoder_model_version_ == DecoderModelVersion::kGemma ||
+ decoder_model_version_ == DecoderModelVersion::kGemma3) {
eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]);
}
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index 9f290d79c75..1472093ab66 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -32,6 +32,7 @@ namespace example {
enum DecoderModelVersion {
kLlama2 = 0,
kLlama3,
+ kGemma,
kGemma3,
kPhi4,
kQwen2_5,
From deb42f2a8e48f5032b4a98ee781a15fa87a157cf Mon Sep 17 00:00:00 2001
From: Laith Sakka
Date: Thu, 2 Oct 2025 10:54:53 -0700
Subject: [PATCH 106/266] update llama export DS specs to be more accurate.
Differential Revision: D83708583
Pull Request resolved: https://github.com/pytorch/executorch/pull/14737
---
extension/llm/export/builder.py | 9 +++++++--
extension/llm/export/test/test_builder.py | 2 +-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 01000f3564c..da5c3324662 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -142,9 +142,14 @@ def __init__(
{1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
)
else:
- # Two input arguments: tokens and input_pos but input_pos is static shape
+ # Two input arguments: tokens and input_pos but input_pos is static shape.
+
+ # A runtime assertion added by torch.ops.llama.update_cache requires that
+ # L['tokens'].size()[1] + input_pos[0].item() < self.max_seq_len.
+ # This constrains L['tokens'].size()[1] to be at most self.max_seq_len - 1.
+ # Run with TORCH_LOGS=+dynamic for details.
self.dynamic_shapes = (
- {1: torch.export.Dim("token_dim", max=self.max_seq_len)},
+ {1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
{"input_pos": {0: 1}},
)
diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py
index 8bf591813ec..7883480c1e7 100644
--- a/extension/llm/export/test/test_builder.py
+++ b/extension/llm/export/test/test_builder.py
@@ -88,7 +88,7 @@ def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> Non
# Check first element (tokens dimension)
self.assertIsInstance(result[0], dict)
self.assertIn(1, result[0])
- self.assertEqual(result[0][1].max, self.max_seq_len)
+ self.assertEqual(result[0][1].max, self.max_seq_len - 1)
# Check second element (input_pos dimension)
self.assertIsInstance(result[1], dict)
From 19258d284c8257a53471a63d0b92f462f8eb2a5c Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Thu, 2 Oct 2025 12:47:25 -0700
Subject: [PATCH 107/266] update tokenizer pin (#14751)
Summary:
https://github.com/meta-pytorch/tokenizers/commit/65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
bringing in this change for windows builds of voxtral runner
Differential Revision: D83759380
---
extension/llm/tokenizers | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index b0076444dec..65e41a96e1b 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b0076444decffb88166452e26ba688233b905647
+Subproject commit 65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
From a1652f97b721dccc4f1f2585d3e1f15a2306e8d0 Mon Sep 17 00:00:00 2001
From: tmi
Date: Fri, 3 Oct 2025 00:16:59 +0200
Subject: [PATCH 108/266] Fix pyproject.toml license classifier deprecation
(#14592)
Gets rid of the 'deprecated' warnings that pop up multiple times during
build/install
Bumps setuptools requirement to accept the new license declaration
format
### Summary
Just a tiny PR, no change to API or code or anything. The license itself
is as before, it just changes the manner in which it is declared -- as
recommended by the PyPA guidelines
https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license
I tried to find any issue related to this but found none. And I guess
not worth it creating one
### Test plan
Since this does not change any of the code, I just tested that the
package can be installed/built as before via `./install_executorch.sh`,
and that the deprecation warnings vanish
---
pyproject.toml | 7 ++++---
requirements-dev.txt | 3 ++-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index fbed875a824..401b1fa2c24 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,10 @@
[build-system]
requires = [
"cmake>=3.29,<4.0.0", # For building binary targets in the wheel. 4.0.0 breaks third-party CMake build so temporarily pin the version.
+ "packaging>=24.2", # Lower bound required by setuptools
"pip>=23", # For building the pip package.
"pyyaml", # Imported by the kernel codegen tools.
- "setuptools>=63", # For building the pip package contents.
+ "setuptools>=77.0.3", # For building the pip package contents.
"wheel", # For building the pip package archive.
"zstd", # Imported by resolve_buck.py.
"certifi", # Imported by resolve_buck.py.
@@ -21,7 +22,8 @@ readme = "README-wheel.md"
authors = [
{name="PyTorch Team", email="packages@pytorch.org"},
]
-license = {file = "LICENSE"}
+license = "BSD-3-Clause"
+license-files = ["LICENSE"]
keywords = ["pytorch", "machine learning"]
# PyPI package information.
classifiers = [
@@ -33,7 +35,6 @@ classifiers = [
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
- "License :: OSI Approved :: BSD License",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Mathematics",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9df5e7b93ed..258a898894c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,9 +1,10 @@
# Pip packages needed to build from source. Mainly for development of ExecuTorch.
cmake>=3.29, <4.0.0 # For building binary targets in the wheel.
+packaging>=24.2 # Lower bound required by setuptools
pip>=23 # For building the pip package.
pyyaml # Imported by the kernel codegen tools.
-setuptools>=63 # For building the pip package contents.
+setuptools>=77.0.3 # For building the pip package contents.
wheel # For building the pip package archive.
zstd # Imported by resolve_buck.py.
certifi # Imported by resolve_buck.py.
From 53ccfd04c2ebd74da7d17174dd64711783466bcf Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Thu, 2 Oct 2025 23:25:43 +0100
Subject: [PATCH 109/266] Fix cuda export test failures from #14715 (#14753)
---
backends/cuda/TARGETS | 1 +
backends/cuda/cuda_backend.py | 4 +++-
backends/cuda/replace_slice_copy_with_slice.py | 13 ++++++++-----
backends/cuda/tests/test_cuda_export.py | 5 ++++-
4 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
index 3e412b6dc56..fe57f7f1b63 100644
--- a/backends/cuda/TARGETS
+++ b/backends/cuda/TARGETS
@@ -6,6 +6,7 @@ runtime.python_library(
name = "cuda_backend",
srcs = [
"cuda_backend.py",
+ "replace_slice_copy_with_slice.py",
],
visibility = [
"//executorch/...",
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index a39065f6a52..8ed8cdefbb1 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -144,7 +144,9 @@ def preprocess(
}
with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
- [SDPBackend.MATH]
+ [
+ SDPBackend.MATH # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
+ ]
), torch.no_grad():
# torch._logging.set_logs(post_grad_graphs=True)
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
index 55ddef5de9b..4f16759af35 100644
--- a/backends/cuda/replace_slice_copy_with_slice.py
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -6,20 +6,23 @@
# pyre-strict
-from typing import Iterable
+from typing import Dict, Iterable, Tuple
import torch
from executorch.exir.dialects._ops import ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
from executorch.exir.pass_base import ExportPass, PassResult
from torch import fx
-_SLICE_COPY_TARGETS = (
+_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
torch.ops.aten.slice_copy.Tensor,
ops.edge.aten.slice_copy.Tensor,
)
-_SLICE_TARGETS = {
+_SLICE_TARGETS: Dict[
+ torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
+] = {
torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
}
@@ -99,8 +102,8 @@ def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
return False
def _argument_mutates(
- self, schema: torch._C.FunctionSchema, key
- ) -> bool: # pyre-ignore[11]
+ self, schema: torch._C.FunctionSchema, key: int | str
+ ) -> bool:
arguments = schema.arguments
if isinstance(key, int):
if key >= len(arguments):
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
index 99f8d33a766..d794a4f042c 100644
--- a/backends/cuda/tests/test_cuda_export.py
+++ b/backends/cuda/tests/test_cuda_export.py
@@ -8,6 +8,7 @@
from typing import Tuple
import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
from torch.export import export
@@ -30,7 +31,9 @@ def _export_to_cuda_with_lower(
exported_program = export(module, inputs, strict=True)
# Create partitioner and compile specs
- partitioner = CudaPartitioner([])
+ partitioner = CudaPartitioner(
+ [CudaBackend.generate_method_name_compile_spec("forward")]
+ )
# Use to_edge_transform_and_lower for complete pipeline
edge_program_manager = to_edge_transform_and_lower(
From c997fe405ac0ad6bf295ca5459f5352c2aeaae45 Mon Sep 17 00:00:00 2001
From: Naveen Suda <99509021+navsud@users.noreply.github.com>
Date: Thu, 2 Oct 2025 18:05:45 -0700
Subject: [PATCH 110/266] Remove explicit device arguments
Differential Revision: D82239525
Pull Request resolved: https://github.com/pytorch/executorch/pull/14619
---
examples/models/llama/model_args.py | 3 +++
examples/models/llama/rope.py | 10 ++++++++--
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py
index 04d29f91ac6..3f9d3d8f2af 100644
--- a/examples/models/llama/model_args.py
+++ b/examples/models/llama/model_args.py
@@ -63,6 +63,9 @@ class ModelArgs:
use_sdpa_with_kv_cache_op: bool = (
False # Use custom sdpa op that updates kv cache in-place
)
+ # Device to use for the model: "cpu" or "cuda" (needed for QAT)
+ # Only used for creating Rope parameters
+ device: str = "cpu"
# Generate logits for all inputs. When it's True, it would take big memory usage
# at runtime. Enable it only necessary (e.g., use perplexity tools that requires
# logits for all input tokens.)
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 8c0d5db6a80..0d1dd306091 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -138,7 +138,11 @@ def forward(
# and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
# Current only support non-long rope.
def hf_precompute_freqs_cis(
- dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+ dim: int,
+ end: int,
+ theta: float,
+ partial_rotary_factor: float = 1.0,
+ device: Union[str, torch.device] = "cpu",
):
# Partial rotary embeddings.
dim = int(dim * partial_rotary_factor)
@@ -146,7 +150,7 @@ def hf_precompute_freqs_cis(
# Short factor scaling.
freqs = 1.0 / (
theta
- ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
+ ** (torch.arange(0, dim, 2, device=device, dtype=torch.int64).float() / dim)
)
# TODO: support long factor scaling.
@@ -236,6 +240,7 @@ def __init__(self, params: ModelArgs):
self.precompute_freqs_cis = partial(
hf_precompute_freqs_cis,
partial_rotary_factor=self.params.partial_rotary_factor,
+ device=self.params.device,
)
self.apply_rotary_emb = hf_apply_rotary_emb
else:
@@ -244,6 +249,7 @@ def __init__(self, params: ModelArgs):
use_scaled=self.params.use_scaled_rope,
scale_factor=self.params.rope_scale_factor,
high_freq_factor=self.params.high_freq_factor,
+ device=self.params.device,
)
self.apply_rotary_emb = RotaryEmbedding()
From 54bfd72921034825f5bd0e5bfcd93808bc8156b1 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Thu, 2 Oct 2025 20:11:48 -0700
Subject: [PATCH 111/266] Fix Wav2Vec Replace Pass Bug
Differential Revision: D83778606
Pull Request resolved: https://github.com/pytorch/executorch/pull/14757
---
backends/cadence/aot/replace_ops.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 9e95460f2f5..2104764cd14 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -89,10 +89,10 @@ def replace_logical_nop_where_with_where(
# Get the third arg node and its input
logical_not_node = node.args[0]
- logical_not_input_tensor = logical_not_node.args[0].to_tensor()
+ logical_not_input_node = logical_not_node.args[0]
# If the logical_not input is not a boolean tensor, bail.
- if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
+ if logical_not_input_node.meta["val"].dtype != torch.bool:
continue
# Replace the where op with another one, flipping the inputs and using the boolean
From 822a711dbe3b12f8defe740ea6ab570dec2841f6 Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Thu, 2 Oct 2025 20:59:25 -0700
Subject: [PATCH 112/266] Update addmm int16 for Ethos-U85
Differential Revision: D83627934
Pull Request resolved: https://github.com/pytorch/executorch/pull/14714
---
backends/arm/operators/op_bmm.py | 23 +++++++++++++++++++++++
backends/arm/test/ops/test_addmm.py | 6 ------
2 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py
index 2636a08d7c5..9bebc3597ca 100644
--- a/backends/arm/operators/op_bmm.py
+++ b/backends/arm/operators/op_bmm.py
@@ -79,6 +79,12 @@ def define_node(
input1_zp = input_qparams[1].get_zp_per_tensor()
bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
bmm_output_name = bmm_result.name
+ elif inputs[0].dtype == ts.DType.INT16:
+ input_qparams = get_input_qparams(node)
+ input0_zp = input_qparams[0].get_zp_per_tensor()
+ input1_zp = input_qparams[1].get_zp_per_tensor()
+ bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT48)
+ bmm_output_name = bmm_result.name
else:
bmm_output_name = output.name
input0_zp, input1_zp = 0, 0
@@ -118,3 +124,20 @@ def define_node(
output_zp=[output_qparams.get_zp_per_tensor()],
rounding_mode=RoundingMode.SINGLE_ROUND,
)
+ elif output.dtype == ts.DType.INT16:
+ output_qparams = get_output_qparams(node)[0]
+ final_output_scale = (
+ input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore[61]
+ ) / output_qparams.get_scale_per_tensor()
+
+ build_rescale(
+ tosa_fb=tosa_graph,
+ scale=[final_output_scale],
+ # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
+ input_node=bmm_result, # type: ignore[possibly-undefined]
+ output_name=output.name,
+ output_type=ts.DType.INT16,
+ input_zp=[0],
+ output_zp=[output_qparams.get_zp_per_tensor()],
+ rounding_mode=RoundingMode.SINGLE_ROUND,
+ )
diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py
index b9a891ec740..1170f65dd58 100644
--- a/backends/arm/test/ops/test_addmm.py
+++ b/backends/arm/test/ops/test_addmm.py
@@ -213,9 +213,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False):
@common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
- reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979"
-)
def test_addmm_16a8w_tosa_INT(test_data: input_t1):
"""Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
@@ -268,9 +265,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1):
@common.parametrize("test_data", test_data_suite)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations"
-)
def test_addmm_16a8w_u85_INT16(test_data: input_t1):
"""Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
per_channel_quantization = False
From e6527463f88fd69862b6799d3e9465b9690d4309 Mon Sep 17 00:00:00 2001
From: Naveen Suda <99509021+navsud@users.noreply.github.com>
Date: Thu, 2 Oct 2025 22:07:58 -0700
Subject: [PATCH 113/266] Use FusedMovingAvgObsFakeQuantize instead of
FakeQuantize for faster QAT
Differential Revision: D83583655
Pull Request resolved: https://github.com/pytorch/executorch/pull/14740
---
backends/qualcomm/quantizer/qconfig.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 30af923781a..694fab3dc6b 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -200,7 +200,7 @@ def get_16a8w_qnn_qat_config(
act_observer=MovingAverageMinMaxObserver,
) -> QuantizationConfig:
extra_args: Dict[str, Any] = {"eps": 2**-20}
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32,
quant_min=torch.iinfo(torch.uint16).min,
quant_max=torch.iinfo(torch.uint16).max,
@@ -398,7 +398,7 @@ def get_ptq_per_block_quant_config(
def get_8a8w_qnn_qat_config(
act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
) -> QuantizationConfig:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.uint8,
qscheme=(
torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
@@ -458,7 +458,7 @@ def get_8a8w_qnn_qat_config(
def get_16a4w_qnn_qat_config(
act_observer=MovingAverageMinMaxObserver,
) -> QuantizationConfig:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32,
quant_min=torch.iinfo(torch.uint16).min,
quant_max=torch.iinfo(torch.uint16).max,
@@ -541,7 +541,7 @@ def get_qat_per_channel_quant_config(
# If zero_point is 128, htp can do optimizations.
# If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
# If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
qscheme=torch.per_tensor_symmetric,
observer=act_observer,
@@ -553,7 +553,7 @@ def get_qat_per_channel_quant_config(
observer_or_fake_quant_ctr=act_fake_quant_ctr,
)
else:
- act_fake_quant_ctr = FakeQuantize.with_args(
+ act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
quant_min=torch.iinfo(act_dtype).min,
quant_max=torch.iinfo(act_dtype).max,
From 70ea66186e34210676171b3fb1ac8055117d8c06 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Thu, 2 Oct 2025 23:00:56 -0700
Subject: [PATCH 114/266] Add Phi4 test and fix regex parsing.
Differential Revision: D83641294
Pull Request resolved: https://github.com/pytorch/executorch/pull/14716
---
.../Exported/ExecuTorchLLMTextRunner.h | 15 +++++++-
.../Exported/ExecuTorchLLMTextRunner.mm | 11 +++++-
.../__tests__/MultimodalRunnerTest.swift | 2 +-
.../__tests__/TextRunnerTest.swift | 37 ++++++++++++++++++-
4 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
index 550a20ea633..50957ee47f5 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
@@ -25,12 +25,23 @@ __attribute__((deprecated("This API is experimental.")))
@param modelPath File system path to the serialized model.
@param tokenizerPath File system path to the tokenizer data.
- @param tokens An array of NSString special tokens to use during tokenization.
+ @return An initialized ExecuTorchLLMTextRunner instance.
+*/
+- (instancetype)initWithModelPath:(NSString *)modelPath
+ tokenizerPath:(NSString *)tokenizerPath;
+
+/**
+ Initializes a text LLM runner with the given model and tokenizer paths,
+ and a list of special tokens to include in the tokenizer.
+
+ @param modelPath File system path to the serialized model.
+ @param tokenizerPath File system path to the tokenizer data.
+ @param specialTokens An array of NSString special tokens to use during tokenization.
@return An initialized ExecuTorchLLMTextRunner instance.
*/
- (instancetype)initWithModelPath:(NSString *)modelPath
tokenizerPath:(NSString *)tokenizerPath
- specialTokens:(NSArray *)tokens
+ specialTokens:(NSArray *)specialTokens
NS_DESIGNATED_INITIALIZER;
/**
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
index 4ea1bd921f7..1a6c3f40045 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
@@ -28,15 +28,22 @@ @implementation ExecuTorchLLMTextRunner {
std::unique_ptr _runner;
}
+- (instancetype)initWithModelPath:(NSString*)modelPath
+ tokenizerPath:(NSString*)tokenizerPath {
+ return [self initWithModelPath:modelPath
+ tokenizerPath:tokenizerPath
+ specialTokens:@[]];
+}
+
- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath
- specialTokens:(NSArray*)tokens {
+ specialTokens:(NSArray*)specialTokens {
self = [super init];
if (self) {
_modelPath = [modelPath copy];
_tokenizerPath = [tokenizerPath copy];
_specialTokens = std::make_unique>();
- for (NSString *token in tokens) {
+ for (NSString *token in specialTokens) {
_specialTokens->emplace_back(token.UTF8String);
}
}
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index cdf15f12350..7ae9da4969b 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -60,7 +60,7 @@ class MultimodalRunnerTest: XCTestCase {
let userPrompt = "What's on the picture?"
let sequenceLength = 768
- func test() {
+ func testLLaVA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index 5e99af0c57f..f7124fec640 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -39,7 +39,7 @@ class TextRunnerTest: XCTestCase {
let userPrompt = "The capital of France is called"
let sequenceLength = 128
- func test() {
+ func testLLaMA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
@@ -73,4 +73,39 @@ class TextRunnerTest: XCTestCase {
}
XCTAssertTrue(text.lowercased().contains("paris"))
}
+
+ func testPhi4() {
+ let bundle = Bundle(for: type(of: self))
+ guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
+ let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+ XCTFail("Couldn't find model or tokenizer files")
+ return
+ }
+ let runner = TextRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+ var text = ""
+
+ do {
+ try runner.generate(userPrompt, Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("paris"))
+
+ text = ""
+ runner.reset()
+ do {
+ try runner.generate(userPrompt, Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("paris"))
+ }
}
From 05799c93bac19db778f380bb906d0d556e1672ca Mon Sep 17 00:00:00 2001
From: Vaclav Novak
Date: Fri, 3 Oct 2025 10:33:40 +0200
Subject: [PATCH 115/266] NXP backend: added aten.sub operator support (#14514)
### Summary
Adds support for the aten.sub operator.
### Test plan
tests can be manually run using `pytest -c /dev/null
backends/nxp/tests/`
---------
Co-authored-by: Martin Pavella
---
.../nxp/backend/edge_program_converter.py | 1 +
.../ops_converters/__init__.py | 4 +
.../ops_converters/sub_tensor_converter.py | 59 ++++++
backends/nxp/neutron_partitioner.py | 1 +
backends/nxp/quantizer/neutron_quantizer.py | 2 +
backends/nxp/quantizer/patterns.py | 26 +++
.../test_add_tensor_converter.py | 4 +
.../test_sub_tensor_converter.py | 175 ++++++++++++++++++
backends/nxp/tests/models.py | 28 +++
9 files changed, 300 insertions(+)
create mode 100644 backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
create mode 100644 backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index febcd03913a..03d55548d2d 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -43,6 +43,7 @@
exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter, # noqa F405
exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405
exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405
+ exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405
exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405
exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405
exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
index 472a3495e19..3cf70f46b8d 100755
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -56,6 +56,9 @@
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.softmax_converter import (
SoftmaxConverter,
)
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.sub_tensor_converter import (
+ SubTensorConverter,
+)
from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.tanh_converter import (
TanhConverter,
)
@@ -80,6 +83,7 @@
"MaxPool2dConverter",
"AvgPool2dConverter",
"AddTensorConverter",
+ "SubTensorConverter",
"CloneConverter",
"AbsConverter",
"AdaptiveAvgPool2dConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
new file mode 100644
index 00000000000..e9522c87114
--- /dev/null
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
@@ -0,0 +1,59 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.nxp.backend.ir.converter.conversion.common import (
+ node_uses_shape_broadcasting,
+)
+from executorch.backends.nxp.backend.ir.converter.node_converter import (
+ CustomDelegationOptions,
+ NodeConverter,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
+ sub_options,
+)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
+from torch.fx import Node
+from torch.nn import Parameter
+
+
+class SubTensorConverter(NodeConverter):
+ @staticmethod
+ def _is_supported_on_target(
+ node: Node,
+ neutron_target_spec: NeutronTargetSpec,
+ parameters_mapping: dict[str, Parameter],
+ custom_delegation_options: CustomDelegationOptions,
+ ) -> bool:
+ if node_uses_shape_broadcasting(node):
+ # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+ return False
+
+ return True
+
+ @staticmethod
+ def _is_supported_in_IR(
+ node: Node,
+ parameters_mapping: dict[str, Parameter],
+ custom_delegation_options: CustomDelegationOptions,
+ ) -> bool:
+ if len(node.args) != 2:
+ return False
+
+ # The `alpha` attribute can be represented by adding an extra `Mul` operator.
+ # However, this is not implemented as `alpha` is rarely used.
+        if "alpha" in node.kwargs:
+ return False
+
+ return True
+
+ # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1)
+ def convert(self, node: Node):
+ """Convert 'sub_tensor' operator to NeutronIR 'Sub'."""
+ self.assert_convertible(node)
+
+ t_op = self._create_tflite_op_with_io_tensors(node)
+
+ t_op.builtin_options = sub_options.Sub()
+ self.builder.append_operators([t_op])
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 917545e6c89..e7ad7ff7a0b 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -210,6 +210,7 @@ def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
exir_ops.edge.aten.mm.default: MMConverter, # noqa F405
exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405
exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405
+ exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405
exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405
exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405
exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index db19bcb8ba8..2681e221869 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -36,6 +36,7 @@
SharedSpecPattern,
SigmoidPattern,
SoftMaxPattern,
+ SubTensorPattern,
TanhInPlacePattern,
TanhPattern,
ViewPattern,
@@ -208,6 +209,7 @@ def __init__(self):
NeutronAtenQuantizer(ReshapePattern(), static_qconfig),
NeutronAtenQuantizer(SigmoidPattern(), static_qconfig),
NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig),
+ NeutronAtenQuantizer(SubTensorPattern(), static_qconfig),
NeutronAtenQuantizer(TanhPattern(), static_qconfig),
NeutronAtenQuantizer(TanhInPlacePattern(), static_qconfig),
NeutronAtenQuantizer(ViewPattern(), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index 34ee611b8b2..9588ce24c9e 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -224,6 +224,32 @@ def get_anchors(
)
+class SubTensorPattern(QuantizationPattern):
+ """
+ Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes.
+
+ Basic quantization for all inputs and output.
+ """
+
+ def partition_types(self) -> list[torch.nn.Module]:
+ return [torch.ops.aten.sub.Tensor]
+
+ def get_anchors(
+ self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+ ) -> PartitionAnchors | None:
+ node = fused_partition[0].nodes[-1]
+ inputs = [(node, NodeArgsIdx(0))]
+ if len(fused_partition[0].input_nodes) == 2:
+ inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))]
+
+ return PartitionAnchors(
+ inputs=inputs,
+ weights=[],
+ biases=[],
+ output=[(node,)],
+ )
+
+
class AvgPoolPattern(SharedSpecPattern):
"""
Quantizer for AvgPool2D operator.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 567b593e05b..2c3107eae77 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -1,3 +1,7 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
import numpy as np
import pytest
import torch
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
new file mode 100644
index 00000000000..98566ff1ad6
--- /dev/null
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -0,0 +1,175 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import pytest
+import torch
+
+from executorch.backends.nxp.backend.edge_program_converter import (
+ EdgeProgramToIRConverter,
+)
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executors import (
+ convert_run_compare,
+ ToChannelFirstPreprocess,
+ ToChannelLastPreprocess,
+)
+from executorch.backends.nxp.tests.models import (
+ SubTensorConvModule,
+ SubTensorModule,
+ SubTensorOneInputModule,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
+
+
+@pytest.fixture(autouse=True)
+def reseed_model_per_test_run():
+ torch.manual_seed(23)
+ np.random.seed(23)
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ [
+ pytest.param((4,), id="1D."),
+ pytest.param((6, 6), id="2D."),
+ pytest.param((1, 4, 8), id="3D."),
+ pytest.param((1, 4, 8, 8), id="4D."),
+ ],
+)
+def test_sub_tensor_quant_conversion(mocker, input_shape):
+ model = SubTensorModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, [input_shape, input_shape])
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data_1 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data_2 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data = {0: input_data_1, 1: input_data_2}
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[4].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+ )
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ [
+ pytest.param((4,), id="1D."),
+ pytest.param((6, 6), id="2D."),
+ pytest.param((1, 4, 8), id="3D."),
+ pytest.param((1, 4, 8, 8), id="4D."),
+ ],
+)
+def test_sub_tensor_one_input_quant_conversion(mocker, input_shape):
+ model = SubTensorOneInputModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, input_shape)
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8)
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[2].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+ )
+
+
+@pytest.mark.parametrize(
+ "x_input_shape",
+ [
+ pytest.param((1, 4, 8, 8), id="4D."),
+ pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."),
+ ],
+)
+def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape):
+ model = SubTensorConvModule()
+
+ converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+ n, c, h, w = x_input_shape
+ y_input_shape = (n, 8, h, w)
+
+ # Run conversion
+ _ = to_quantized_edge_program(model, [x_input_shape, y_input_shape])
+
+ # Capture generated model
+ tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+ # Capture converted program
+ exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+ input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype(
+ np.int8
+ )
+ input_data = {0: input_data_1, 1: input_data_2}
+
+ nodes = list(exported_program.graph.nodes)
+ assert nodes[15].target == exir_ops.edge.aten.sub.Tensor
+
+ convert_run_compare(
+ exported_program,
+ input_data=input_data,
+ tflite_input_preprocess=ToChannelLastPreprocess(),
+ tfl_model=tflite_flatbuffers_model,
+ tflite_output_preprocess=ToChannelFirstPreprocess(),
+ )
+
+
+@pytest.mark.parametrize(
+ "x_input_shape, y_input_shape",
+ [
+ pytest.param((1, 4, 7), (4, 7), id="3D -> 2D."),
+ pytest.param((1, 4, 8), (1, 4, 4, 8), id="3D -> 4D."),
+ pytest.param((1, 1, 4, 4, 8), (1, 4, 4, 8), id="5D -> 4D."),
+ pytest.param((4,), (4, 4), id="1D -> 2D."),
+ pytest.param((4,), (4, 4, 4), id="1D -> 3D."),
+ pytest.param((6, 6), (1, 8, 6, 6), id="2D -> 4D."),
+ pytest.param((6, 6), (6,), id="2D -> 1D."),
+ ],
+)
+def test_sub_tensor_broadcasting_unsupported_quant_conversion(
+ x_input_shape, y_input_shape
+):
+ model = SubTensorModule()
+
+ # Run conversion
+ edge_program = to_quantized_edge_program(
+ model, [x_input_shape, y_input_shape]
+ ).exported_program()
+ nodes = list(edge_program.graph.nodes)
+
+ # Broadcast is not supported, node is not converted
+ assert (
+ nodes[6].target == exir_ops.edge.aten.sub.Tensor
+ ) # Sub Tensor is not delegated.
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
index e7b60b2566c..f613349fed0 100644
--- a/backends/nxp/tests/models.py
+++ b/backends/nxp/tests/models.py
@@ -451,6 +451,34 @@ def forward(x):
return x + x
+class SubTensorModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ @staticmethod
+ def forward(x, y):
+ return x - y
+
+
+class SubTensorConvModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv = Conv2dModule(padding=1, stride=1)
+
+ def forward(self, x, y):
+ x = self.conv(x)
+ return x - y
+
+
+class SubTensorOneInputModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ @staticmethod
+ def forward(x):
+ return x - x
+
+
class MeanDimLinearModule(torch.nn.Module):
def __init__(self, dim, keepdim):
super().__init__()
From 3557edf1dfab4fcc9732bfae30f61001a4f96d7f Mon Sep 17 00:00:00 2001
From: neuropilot-captain
Date: Fri, 3 Oct 2025 21:15:50 +0800
Subject: [PATCH 116/266] Update MTK tool versions in documents (#14772)
### Summary
NeuroPilot Express SDK is released for ExecuTorch 1.0. Update the
document for the latest tool version
Resolves discussion 14253
---
backends/mediatek/README.md | 4 ++--
docs/source/backends-mediatek.md | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md
index e8a535b3fde..6ff751f8408 100644
--- a/backends/mediatek/README.md
+++ b/backends/mediatek/README.md
@@ -28,7 +28,7 @@ To get started with MediaTek's ExecuTorch libraries, download the [NeuroPilot Ex
- **`mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`**: This library preprocesses the model into a MediaTek representation.
-- **`mtk_neuron-8.2.19-py3-none-linux_x86_64.whl`**: This library converts the model to binaries.
+- **`mtk_neuron-8.2.23-py3-none-linux_x86_64.whl`**: This library converts the model to binaries.
Additionally, make sure to copy `NeuronAdapter.h` to the following directory: `backends/mediatek/runtime/include/api/`.
@@ -45,7 +45,7 @@ Follow the steps below to setup your build environment:
```
- Install the two .whl downloaded from NeuroPilot Portal
```bash
- pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+ pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```
diff --git a/docs/source/backends-mediatek.md b/docs/source/backends-mediatek.md
index a562cea13bd..34cd56f971b 100644
--- a/docs/source/backends-mediatek.md
+++ b/docs/source/backends-mediatek.md
@@ -23,7 +23,7 @@ The MediaTek backend enables acceleration of PyTorch models on edge devices with
```
- NeuroPilot SDK Python wheels (download from [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress)):
```bash
- pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+ pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```
From c44c5417f79b39c750701f66e0f26b84fa2cd770 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 12:11:58 -0400
Subject: [PATCH 117/266] Runner support for multiple ptd files (#14758)
This PR was created by the merge bot to help merge the original PR into
the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/14159 by
@lucylq
^ Please use this as the source of truth for the PR details, comments,
and reviews
ghstack PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/base
ghstack PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/main
Merge bot PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/orig
Differential Revision:
[D82072385](https://our.internmc.facebook.com/intern/diff/D82072385/)
@diff-train-skip-merge
Co-authored-by: lucylq
---
examples/models/llama/runner/runner.cpp | 17 +++++++++++++++-
examples/models/llama/runner/runner.h | 11 +++++++----
extension/llm/runner/llm_runner_helper.cpp | 22 +++++++++++++++++++--
extension/llm/runner/llm_runner_helper.h | 23 +++++++++++++++++++++-
4 files changed, 65 insertions(+), 8 deletions(-)
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
index 2ba2fdf9941..19ed9f88339 100644
--- a/examples/models/llama/runner/runner.cpp
+++ b/examples/models/llama/runner/runner.cpp
@@ -37,6 +37,21 @@ std::unique_ptr create_llama_runner(
const std::string& tokenizer_path,
std::optional data_path,
float temperature) {
+ if (data_path.has_value()) {
+ std::vector data_files;
+ data_files.push_back(data_path.value());
+ return create_llama_runner(
+ model_path, tokenizer_path, std::move(data_files), temperature);
+ }
+ return create_llama_runner(
+ model_path, tokenizer_path, std::vector(), temperature);
+}
+
+std::unique_ptr create_llama_runner(
+ const std::string& model_path,
+ const std::string& tokenizer_path,
+ std::vector data_files,
+ float temperature) {
ET_LOG(
Info,
"Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
@@ -55,7 +70,7 @@ std::unique_ptr create_llama_runner(
return nullptr;
}
return llm::create_text_llm_runner(
- model_path, std::move(tokenizer), data_path);
+ model_path, std::move(tokenizer), data_files);
}
} // namespace example
diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h
index f07cd4e8ee8..728ae57efa8 100644
--- a/examples/models/llama/runner/runner.h
+++ b/examples/models/llama/runner/runner.h
@@ -11,12 +11,9 @@
#pragma once
-#include
-#include
#include
#include
#include
-#include
#include
#include
@@ -30,7 +27,13 @@ namespace llm = ::executorch::extension::llm;
std::unique_ptr create_llama_runner(
const std::string& model_path,
const std::string& tokenizer_path,
- std::optional data_path = std::nullopt,
+ std::optional data_path,
+ float temperature = -1.0f);
+
+std::unique_ptr create_llama_runner(
+ const std::string& model_path,
+ const std::string& tokenizer_path,
+ std::vector data_files = {},
float temperature = -1.0f);
std::unique_ptr load_llama_tokenizer(
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index f12de5f1d87..d1e4ff2ce45 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -183,6 +183,24 @@ std::unique_ptr create_text_llm_runner(
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::optional data_path,
float temperature) {
+ if (data_path.has_value()) {
+ std::vector data_files;
+ data_files.push_back(data_path.value());
+ return create_text_llm_runner(
+ model_path, std::move(tokenizer), std::move(data_files), temperature);
+ }
+ return create_text_llm_runner(
+ model_path,
+ std::move(tokenizer),
+ std::vector(),
+ temperature);
+}
+
+std::unique_ptr create_text_llm_runner(
+ const std::string& model_path,
+ std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+ std::vector data_files,
+ float temperature) {
// Sanity check tokenizer
if (!tokenizer || !tokenizer->is_loaded()) {
ET_LOG(Error, "Tokenizer is null or not loaded");
@@ -191,9 +209,9 @@ std::unique_ptr create_text_llm_runner(
// Create the Module
std::unique_ptr module;
- if (data_path.has_value()) {
+ if (data_files.size() > 0) {
module = std::make_unique(
- model_path, data_path.value(), Module::LoadMode::File);
+ model_path, data_files, Module::LoadMode::File);
} else {
module = std::make_unique(model_path, Module::LoadMode::File);
}
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 191ea3ab090..5c109581e19 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -101,7 +101,28 @@ ET_EXPERIMENTAL std::unordered_set get_eos_ids(
ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
- std::optional data_path = std::nullopt,
+ std::optional data_path,
+ float temperature = -1.0f);
+
+/**
+ * @brief Creates a TextLLMRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a TextLLMRunner with all
+ * necessary components for text generation using the specified model and
+ * tokenizer.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_files Vector of paths to additional data required by the model
+ * @param temperature Optional temperature parameter for controlling randomness
+ * (deprecated)
+ * @return std::unique_ptr Initialized TextLLMRunner instance, or
+ * nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner(
+ const std::string& model_path,
+ std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+ std::vector data_files = {},
float temperature = -1.0f);
/**
From 4d681cb3b81de5b5fc4c7969f109e83e4607a06c Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 12:13:53 -0400
Subject: [PATCH 118/266] JNI support for multiple ptd files (#14769)
This PR was created by the merge bot to help merge the original PR into
the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/14168 by
@lucylq
^ Please use this as the source of truth for the PR details, comments,
and reviews
ghstack PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/base
ghstack PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/head
Merge bot PR base:
https://github.com/pytorch/executorch/tree/gh/lucylq/111/orig
Merge bot PR head:
https://github.com/pytorch/executorch/tree/gh/lucylq/113/orig
Differential Revision:
[D82072929](https://our.internmc.facebook.com/intern/diff/D82072929/)
@diff-train-skip-merge
---------
Co-authored-by: lucylq
---
.../executorch/extension/llm/LlmModule.java | 33 +++++++++++++++----
extension/android/jni/jni_layer_llama.cpp | 29 +++++++++++-----
2 files changed, 47 insertions(+), 15 deletions(-)
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index 289df5defd9..f135731f26a 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -11,6 +11,7 @@
import com.facebook.jni.HybridData;
import com.facebook.jni.annotations.DoNotStrip;
import java.io.File;
+import java.util.List;
import org.pytorch.executorch.ExecuTorchRuntime;
import org.pytorch.executorch.annotations.Experimental;
@@ -32,14 +33,22 @@ public class LlmModule {
@DoNotStrip
private static native HybridData initHybrid(
- int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath);
+ int modelType,
+ String modulePath,
+ String tokenizerPath,
+ float temperature,
+ List dataFiles);
/**
* Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
- * data path.
+ * dataFiles.
*/
public LlmModule(
- int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+ int modelType,
+ String modulePath,
+ String tokenizerPath,
+ float temperature,
+ List dataFiles) {
ExecuTorchRuntime runtime = ExecuTorchRuntime.getRuntime();
File modelFile = new File(modulePath);
@@ -50,12 +59,22 @@ public LlmModule(
if (!tokenizerFile.canRead() || !tokenizerFile.isFile()) {
throw new RuntimeException("Cannot load tokenizer path " + tokenizerPath);
}
- mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataPath);
+
+ mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataFiles);
+ }
+
+ /**
+ * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+ * data path.
+ */
+ public LlmModule(
+ int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+ this(modelType, modulePath, tokenizerPath, temperature, List.of(dataPath));
}
/** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */
public LlmModule(String modulePath, String tokenizerPath, float temperature) {
- this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, null);
+ this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of());
}
/**
@@ -63,12 +82,12 @@ public LlmModule(String modulePath, String tokenizerPath, float temperature) {
* path.
*/
public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) {
- this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, dataPath);
+ this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of(dataPath));
}
/** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) {
- this(modelType, modulePath, tokenizerPath, temperature, null);
+ this(modelType, modulePath, tokenizerPath, temperature, List.of());
}
/** Constructs a LLM Module for a model with the given LlmModuleConfig */
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index cabf30c42e4..a0c90991bf7 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -140,13 +140,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
facebook::jni::alias_ref model_path,
facebook::jni::alias_ref tokenizer_path,
jfloat temperature,
- facebook::jni::alias_ref data_path) {
+ facebook::jni::alias_ref data_files) {
return makeCxxInstance(
model_type_category,
model_path,
tokenizer_path,
temperature,
- data_path);
+ data_files);
}
ExecuTorchLlmJni(
@@ -154,7 +154,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
facebook::jni::alias_ref model_path,
facebook::jni::alias_ref tokenizer_path,
jfloat temperature,
- facebook::jni::alias_ref data_path = nullptr) {
+ facebook::jni::alias_ref data_files = nullptr) {
temperature_ = temperature;
#if defined(ET_USE_THREADPOOL)
// Reserve 1 thread for the main thread.
@@ -173,18 +173,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
model_path->toStdString().c_str(),
llm::load_tokenizer(tokenizer_path->toStdString()));
} else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
- std::optional data_path_str = data_path
- ? std::optional{data_path->toStdString()}
- : std::nullopt;
+ std::vector data_files_vector;
+ if (data_files != nullptr) {
+ // Convert Java List to C++ std::vector
+ auto list_class = facebook::jni::findClassStatic("java/util/List");
+ auto size_method = list_class->getMethod("size");
+ auto get_method =
+ list_class->getMethod(jint)>(
+ "get");
+
+ jint size = size_method(data_files);
+ for (jint i = 0; i < size; ++i) {
+ auto str_obj = get_method(data_files, i);
+ auto jstr = facebook::jni::static_ref_cast(str_obj);
+ data_files_vector.push_back(jstr->toStdString());
+ }
+ }
runner_ = executorch::extension::llm::create_text_llm_runner(
model_path->toStdString(),
llm::load_tokenizer(tokenizer_path->toStdString()),
- data_path_str);
+ data_files_vector);
#if defined(EXECUTORCH_BUILD_QNN)
} else if (model_type_category == MODEL_TYPE_QNN_LLAMA) {
std::unique_ptr module = std::make_unique<
executorch::extension::Module>(
model_path->toStdString().c_str(),
+ data_files_set,
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
std::string decoder_model = "llama3"; // use llama3 for now
runner_ = std::make_unique>( // QNN runner
@@ -192,7 +206,6 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass {
decoder_model.c_str(),
model_path->toStdString().c_str(),
tokenizer_path->toStdString().c_str(),
- data_path->toStdString().c_str(),
"");
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
#endif
From 7116e0ad6d0454755f1a90016ae96a4d2ede3329 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 3 Oct 2025 19:18:21 +0100
Subject: [PATCH 119/266] Tag mutated buffer for AOTI cuda partitioner (#14783)
This should avoid having to copy mutated buffer back to outputs.
Before PR I'm getting this graph:
```
graph():
%b_key_cache_0 : [num_users=1] = placeholder[target=b_key_cache_0]
%b_value_cache_0 : [num_users=1] = placeholder[target=b_value_cache_0]
%b_key_cache_1 : [num_users=1] = placeholder[target=b_key_cache_1]
%b_value_cache_1 : [num_users=1] = placeholder[target=b_value_cache_1]
%b_key_cache_2 : [num_users=1] = placeholder[target=b_key_cache_2]
%b_value_cache_2 : [num_users=1] = placeholder[target=b_value_cache_2]
%b_key_cache_3 : [num_users=1] = placeholder[target=b_key_cache_3]
%b_value_cache_3 : [num_users=1] = placeholder[target=b_value_cache_3]
...
%b_key_cache_29 : [num_users=1] = placeholder[target=b_key_cache_29]
%b_value_cache_29 : [num_users=1] = placeholder[target=b_value_cache_29]
%inputs_embeds : [num_users=1] = placeholder[target=inputs_embeds]
%cache_position : [num_users=1] = placeholder[target=cache_position]
%lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0]
%executorch_call_delegate : [num_users=61] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %inputs_embeds, %cache_position, %b_value_cache_0, %b_key_cache_0, %b_value_cache_1, %b_key_cache_1, %b_value_cache_2, %b_key_cache_2, %b_value_cache_3, %b_key_cache_3, %b_value_cache_4, %b_key_cache_4, %b_value_cache_5, %b_key_cache_5, %b_value_cache_6, %b_key_cache_6, %b_value_cache_7, %b_key_cache_7, %b_value_cache_8, %b_key_cache_8, %b_value_cache_9, %b_key_cache_9, %b_value_cache_10, %b_key_cache_10, %b_value_cache_11, %b_key_cache_11, %b_value_cache_12, %b_key_cache_12, %b_value_cache_13, %b_key_cache_13, %b_value_cache_14, %b_key_cache_14, %b_value_cache_15, %b_key_cache_15, %b_value_cache_16, %b_key_cache_16, %b_value_cache_17, %b_key_cache_17, %b_value_cache_18, %b_key_cache_18, %b_value_cache_19, %b_key_cache_19, %b_value_cache_20, %b_key_cache_20, %b_value_cache_21, %b_key_cache_21, %b_value_cache_22, %b_key_cache_22, %b_value_cache_23, %b_key_cache_23, %b_value_cache_24, %b_key_cache_24, %b_value_cache_25, %b_key_cache_25, %b_value_cache_26, %b_key_cache_26, %b_value_cache_27, %b_key_cache_27, %b_value_cache_28, %b_key_cache_28, %b_value_cache_29, %b_key_cache_29), kwargs = {})
%getitem : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {})
%getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 1), kwargs = {})
%getitem_2 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 2), kwargs = {})
%getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 3), kwargs = {})
%getitem_4 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 4), kwargs = {})
...
%getitem_60 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 60), kwargs = {})
return (getitem_1, getitem, getitem_3, getitem_2, getitem_5, getitem_4, getitem_7, getitem_6, getitem_9, getitem_8, getitem_11, getitem_10, getitem_13, getitem_12, getitem_15, getitem_14, getitem_17, getitem_16, getitem_19, getitem_18, getitem_21, getitem_20, getitem_23, getitem_22, getitem_25, getitem_24, getitem_27, getitem_26, getitem_29, getitem_28, getitem_31, getitem_30, getitem_33, getitem_32, getitem_35, getitem_34, getitem_37, getitem_36, getitem_39, getitem_38, getitem_41, getitem_40, getitem_43, getitem_42, getitem_45, getitem_44, getitem_47, getitem_46, getitem_49, getitem_48, getitem_51, getitem_50, getitem_53, getitem_52, getitem_55, getitem_54, getitem_57, getitem_56, getitem_59, getitem_58, getitem_60)/home/larryliu/.conda/envs/executorch/lib/python3.11/site-packages/executorch/exir/emit/_emitter.py:1595: UserWarning: Mutation on a buffer in the model is detected. ExecuTorch assumes buffers that are mutated in the graph have a meaningless initial state, only the shape and dtype will be serialized, unless a pass which sets meta["et_init_buffer"] to True such as InitializedMutableBufferPass is run.
warnings.warn(
```
This is unnecessary because we don't want the kv cache as output.
After applying this PR I'm getting this graph instead:
```
graph():
%inputs_embeds : [num_users=1] = placeholder[target=inputs_embeds]
%cache_position : [num_users=1] = placeholder[target=cache_position]
%lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0]
%executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %inputs_embeds, %cache_position), kwargs = {})
%getitem_60 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {})
return (getitem_60,)
```
### Summary
[PLEASE REMOVE] See [CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests)
for ExecuTorch PR guidelines.
[PLEASE REMOVE] If this PR closes an issue, please add a `Fixes
#<issue-id>` line.
[PLEASE REMOVE] If this PR introduces a fix or feature that should be
in the upcoming release notes, please add a "Release notes: " label.
For a list of available release notes labels, check out
[CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests).
### Test plan
[PLEASE REMOVE] How did you test this PR? Please write down any manual
commands you used and note down tests that you have written if
applicable.
---
backends/cuda/cuda_partitioner.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index 14c75bdb937..64df7b7dcb2 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -15,7 +15,7 @@
Partitioner,
PartitionResult,
)
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch.export.exported_program import ExportedProgram
@@ -54,6 +54,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
partition_tags[tag] = self.delegation_spec
tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
From b021fd01eab33b14749197a1fd7bbd2dfa85e823 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Fri, 3 Oct 2025 14:24:33 -0700
Subject: [PATCH 120/266] Support im2row
Differential Revision: D83620790
Pull Request resolved: https://github.com/pytorch/executorch/pull/14729
---
backends/cadence/aot/ref_implementations.py | 113 +++++++
.../aot/tests/test_ref_implementations.py | 293 ++++++++++++++++++
2 files changed, 406 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index ca15e825ff0..886cb14d0d6 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -1303,3 +1303,116 @@ def rope(
[x0 * cos_tensor - x1 * sin_tensor, x0 * sin_tensor + x1 * cos_tensor], dim=-1
)
return rotated.view(original_shape)
+
+
+@impl(m, "im2row")
+def im2row(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: torch.Tensor,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ """
+ Converts an input tensor into a 2D matrix where each row is a flattened sliding window (patch)
+ from the input, suitable for use in convolution as a matrix multiplication (im2row).
+
+ Args:
+ - input_tensor: Input tensor of shape (N, C, H, W) or (N, H, W, C) if channel_last.
+ - kernel_size: Size of the convolution kernel.
+ - dilation: Dilation of the convolution kernel.
+ - padding: Padding to apply to the input.
+ - stride: Stride of the convolution.
+ - in_zero_point : Zero point for input quantization (broadcastable to input).
+ - channel_last: If True, input is in NHWC format, else NCHW.
+
+ Returns:
+ - Tensor of shape (N, num_patches, patch_size)
+ """
+ if len(input_tensor.shape) == 3:
+ height_dim = 1 if channel_last else 2
+ input_tensor = input_tensor.unsqueeze(height_dim)
+
+ if in_zero_point is not None:
+ if in_zero_point.numel() != 1 and in_zero_point.shape != (
+ input_tensor.shape[0],
+ ):
+ raise ValueError(
+ f"Input zero point must be a scalar or broadcastable to input shape {input_tensor.shape}"
+ )
+ if in_zero_point.dtype != torch.int32:
+ raise ValueError("Input zero point must be an int32 tensor")
+
+ if channel_last:
+ input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW
+
+ N, C, H, W = input_tensor.shape
+ kH, kW = kernel_size
+ dH, dW = dilation
+ pH, pW = padding
+ sH, sW = stride
+
+ # Handle padding with zero point values
+ if in_zero_point is not None and (pH > 0 or pW > 0):
+ # Expand zero point to (N, 1, 1, 1) for broadcasting
+ in_zero_point = in_zero_point.expand(N)
+
+ # Pad input with the per-batch zero point values
+ input_tensor = torch.stack(
+ [
+ torch.nn.functional.pad(
+ input_tensor[i],
+ (pW, pW, pH, pH),
+ mode="constant",
+ value=in_zero_point[i].item(),
+ )
+ for i in range(len(input_tensor))
+ ]
+ )
+
+ padding = (0, 0) # Already padded manually
+
+ # Use unfold to extract sliding local blocks
+ # Unfold: (N, C, H, W) -> (N, C, L, kH, kW), where L = number of sliding windows
+ # torch.nn.functional.unfold returns (N, C*kH*kW, L)
+ patches = torch.nn.functional.unfold(
+ input_tensor.float(), # unfold not implemented for int
+ kernel_size=(kH, kW),
+ dilation=(dH, dW),
+ padding=padding,
+ stride=(sH, sW),
+ ).to(
+ input_tensor.dtype
+ ) # (N, C*kH*kW, L)
+
+ # Transpose to (N, L, C*kH*kW)
+ patches = patches.transpose(1, 2).contiguous()
+
+ # Reshape to (N*L, C*kH*kW)
+ patches = patches.view(N, -1, C * kH * kW)
+
+ # If channel_last, output should be in NHWC patch order (but im2row is always row-major)
+ return patches
+
+
+@impl(m, "im2row.per_tensor")
+def im2row_per_tensor(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: int,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ return im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ torch.tensor(in_zero_point, dtype=torch.int32),
+ channel_last,
+ )
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 8d02c5c2963..0aa1f0a243a 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -1843,3 +1843,296 @@ def test_avg_pool2d(
torch.equal(output, expected_output),
f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
)
+
+ @expand(
+ [
+ # Basic 2x2 kernel, stride 1, no padding, NCHW
+ (
+ "nchw_basic_2x2",
+ torch.tensor(
+ [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+ ), # (N=1, C=1, H=3, W=3)
+ (2, 2), # kernel_size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (1, 1), # stride
+ None, # in_zero_point
+ False, # channel_last
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 2, no padding, NCHW
+ (
+ "nchw_stride2",
+ torch.tensor(
+ [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (2, 2),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5]],
+ ],
+ dtype=torch.float32, # Only every other patch in each dim
+ ),
+ ),
+ # 2x2 kernel, stride 1, padding 1, NCHW
+ (
+ "nchw_padding1",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32), # (1,1,2,2)
+ (2, 2),
+ (1, 1),
+ (1, 1),
+ (1, 1),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [0, 0, 0, 1],
+ [0, 0, 1, 2],
+ [0, 0, 2, 0],
+ [0, 1, 0, 3],
+ [1, 2, 3, 4],
+ [2, 0, 4, 0],
+ [0, 3, 0, 0],
+ [3, 4, 0, 0],
+ [4, 0, 0, 0],
+ ],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NHWC
+ (
+ "nhwc_basic_2x2",
+ torch.tensor(
+ [[[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]]],
+ dtype=torch.float32,
+ ), # (N=1, H=3, W=3, C=1)
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ None,
+ True,
+ False,
+ torch.tensor(
+ [
+ [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NCHW, in_zero_point=1
+ (
+ "nchw_in_zero_point_no_padding",
+ torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor(1, dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [[2, 3, 5, 6], [3, 4, 6, 7], [5, 6, 8, 9], [6, 7, 9, 10]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ (
+ "nchw_in_zero_point_with_padding=1_and_stride=2",
+ torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+ (2, 2),
+ (1, 1),
+ (1, 1),
+ (2, 2),
+ torch.tensor(-1, dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [-1, -1, -1, 2],
+ [-1, -1, 3, 4],
+ [-1, 5, -1, 8],
+ [6, 7, 9, 10],
+ ],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ # 2x2 kernel, stride 1, no padding, NHWC, in_zero_point=2
+ (
+ "nhwc_in_zero_point",
+ torch.tensor(
+ [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+ dtype=torch.int8,
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor(2, dtype=torch.int32),
+ True,
+ False,
+ torch.tensor(
+ [
+ [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ # Multi-channel input, 2x2 kernel, stride 1, no padding, NCHW
+ (
+ "nchw_multi_channel",
+ torch.tensor(
+ [
+ [
+ [[1, 2, 3], [4, 5, 6], [7, 8, 9]], # channel 0
+ [[10, 11, 12], [13, 14, 15], [16, 17, 18]], # channel 1
+ ]
+ ],
+ dtype=torch.float32,
+ ), # (1,2,3,3)
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ None,
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 2, 4, 5, 10, 11, 13, 14],
+ [2, 3, 5, 6, 11, 12, 14, 15],
+ [4, 5, 7, 8, 13, 14, 16, 17],
+ [5, 6, 8, 9, 14, 15, 17, 18],
+ ],
+ ],
+ dtype=torch.float32,
+ ),
+ ),
+ # Multi-channel input and multi-channel zero-point
+ (
+ "nchw_multi_channel_and_zero_point_no_padding",
+ torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+ (1, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ torch.tensor([-1, -2], dtype=torch.int32),
+ False,
+ False,
+ torch.tensor([[[1, 2], [2, 3]], [[4, 5], [5, 6]]], dtype=torch.int32),
+ ),
+ (
+ "nchw_multi_channel_and_zero_point_with_padding=1_and_stride=(2, 1)",
+ torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+ (1, 2),
+ (1, 1),
+ (2, 1),
+ (2, 2),
+ torch.tensor([-1, -2], dtype=torch.int32),
+ False,
+ False,
+ torch.tensor(
+ [
+ [
+ [-1, -1],
+ [-1, -1],
+ [-1, 1],
+ [2, 3],
+ [-1, -1],
+ [-1, -1],
+ ],
+ [
+ [-2, -2],
+ [-2, -2],
+ [-2, 4],
+ [5, 6],
+ [-2, -2],
+ [-2, -2],
+ ],
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "per_tensor",
+ torch.tensor(
+ [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+ dtype=torch.int8,
+ ),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ 2,
+ True,
+ True,
+ torch.tensor(
+ [
+ [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+ ],
+ dtype=torch.int8,
+ ),
+ ),
+ ]
+ )
+ def test_im2row(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ in_zero_point: torch.Tensor | None,
+ channel_last: bool,
+ per_tensor: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ if per_tensor:
+ output = torch.ops.cadence.im2row.per_tensor(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ in_zero_point,
+ channel_last,
+ )
+ else:
+ output = torch.ops.cadence.im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ in_zero_point,
+ channel_last,
+ )
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"im2row output shape mismatch in {name}",
+ )
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"im2row output mismatch in {name}: got {output}, expected {expected_output}",
+ )
From 7c7b729e0413390c8991819c87324ab9fb5d8c4c Mon Sep 17 00:00:00 2001
From: lucylq
Date: Fri, 3 Oct 2025 16:35:16 -0700
Subject: [PATCH 121/266] Patch
https://github.com/pytorch/executorch/pull/14754 (#14786)
landed as https://www.internalfb.com/diff/D82075758
---
.../ExecuTorch/Exported/ExecuTorchModule.h | 8 ++++----
.../ExecuTorch/Exported/ExecuTorchModule.mm | 19 +++++++++++++------
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
index cda9a914bc3..9b8400d739f 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
@@ -126,14 +126,14 @@ NS_SWIFT_NAME(Module)
* Initializes a module with a file path, data path and a specified load mode.
*
* @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
* external tensors and external data.
* @param loadMode A value from ExecuTorchModuleLoadMode that determines the
* file loading behavior.
* @return An initialized ExecuTorchModule instance.
*/
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath
+ dataFilePaths:(NSArray *)dataFilePaths
loadMode:(ExecuTorchModuleLoadMode)loadMode
NS_DESIGNATED_INITIALIZER;
@@ -141,12 +141,12 @@ NS_SWIFT_NAME(Module)
* Initializes a module with a file path, data path and a specified load mode.
*
* @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
* external tensors and external data.
* @return An initialized ExecuTorchModule instance.
*/
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath;
+ dataFilePaths:(NSArray *)dataFilePaths;
/**
* Initializes a module with a file path and a specified load mode.
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
index ce58f2fb21a..69bb59c860e 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
@@ -250,13 +250,20 @@ @implementation ExecuTorchModule {
}
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath
+ dataFilePaths:(NSArray *)dataFilePaths
loadMode:(ExecuTorchModuleLoadMode)loadMode {
self = [super init];
if (self) {
+ // Convert NSArray to std::vector
+ std::vector dataFilePathsVector;
+ if (dataFilePaths != nil) {
+ for (NSString *dataFile in dataFilePaths) {
+ dataFilePathsVector.emplace_back(dataFile.UTF8String);
+ }
+ }
_module = std::make_unique(
filePath.UTF8String,
- dataFilePath.UTF8String,
+ dataFilePathsVector,
static_cast(loadMode)
);
_inputs = [NSMutableDictionary new];
@@ -266,21 +273,21 @@ - (instancetype)initWithFilePath:(NSString *)filePath
}
- (instancetype)initWithFilePath:(NSString *)filePath
- dataFilePath:(NSString *)dataFilePath {
+ dataFilePaths:(NSArray *)dataFilePaths {
return [self initWithFilePath:filePath
- dataFilePath:dataFilePath
+ dataFilePaths:dataFilePaths
loadMode:ExecuTorchModuleLoadModeFile];
}
- (instancetype)initWithFilePath:(NSString *)filePath
loadMode:(ExecuTorchModuleLoadMode)loadMode {
return [self initWithFilePath:filePath
- dataFilePath:@""
+ dataFilePaths:@[]
loadMode:loadMode];
}
- (instancetype)initWithFilePath:(NSString *)filePath {
return [self initWithFilePath:filePath
- dataFilePath:@""
+ dataFilePaths:@[]
loadMode:ExecuTorchModuleLoadModeFile];
}
From 0ee11607fc08d7c02374ddde1f92ed8c273b15b4 Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:36:02 -0700
Subject: [PATCH 122/266] Add transposed im2row
Differential Revision: D83709868
Pull Request resolved: https://github.com/pytorch/executorch/pull/14738
---
backends/cadence/aot/ref_implementations.py | 156 ++++++++++++++++
.../aot/tests/test_ref_implementations.py | 170 ++++++++++++++++++
2 files changed, 326 insertions(+)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 886cb14d0d6..2642340679e 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -1416,3 +1416,159 @@ def im2row_per_tensor(
torch.tensor(in_zero_point, dtype=torch.int32),
channel_last,
)
+
+
+@impl(m, "transposed_im2row")
+def transposed_im2row(
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ output_padding: tuple[int, int],
+ in_zero_point: torch.Tensor,
+ channel_last: bool = False,
+) -> torch.Tensor:
+ """
+ Converts input tensor patches into im2row format for transposed convolutions.
+ This function extracts patches from input in a pattern suitable for transposed convolution.
+
+ Args:
+ - input_tensor: Input spatial tensor, NCHW or NHWC format (3D or 4D).
+ - kernel_size: Size of the convolution kernel.
+ - dilation: Dilation of the convolution kernel.
+ - padding: Padding to apply to the input.
+ - stride: Stride of the convolution.
+ - output_padding: Additional output padding for transposed convolution.
+ - in_zero_point: Zero point for input quantization (broadcastable to input).
+ - channel_last: If True, input is in NHWC format, else NCHW.
+
+ Returns:
+ - 3D tensor of shape (N, output_h * output_w, kernel_h * kernel_w * in_c)
+ """
+ # Handle 1D convolution case by adding height dimension
+ if len(input_tensor.shape) == 3:
+ height_dim = 1 if channel_last else 2
+ input_tensor = input_tensor.unsqueeze(height_dim)
+
+ if in_zero_point is not None:
+ if in_zero_point.dtype != torch.int32:
+ raise ValueError("Input zero point must be an int32 tensor")
+
+ # Move to NCHW for processing if needed
+ if channel_last:
+ input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW
+
+ N, C, H_in, W_in = input_tensor.shape
+
+ # Output: (N, C*H_in*W_in, H_out, W_out)
+ H_out = (
+ (H_in - 1) * stride[0]
+ + kernel_size[0]
+ + output_padding[0]
+ - 2 * padding[0]
+ + dilation[0] * (kernel_size[0] - 1)
+ )
+ W_out = (
+ (W_in - 1) * stride[1]
+ + kernel_size[1]
+ + output_padding[1]
+ - 2 * padding[1]
+ + dilation[1] * (kernel_size[1] - 1)
+ )
+
+ # For each input pixel, create a channel where the upsampled (transposed conv) patch is placed
+ # Output: (N, C*H_in*W_in, H_out, W_out)
+ inp_flat = input_tensor.reshape(N, C * H_in * W_in)
+
+ # Calculate output spatial size
+ H_out = (
+ (H_in - 1) * stride[0]
+ - 2 * padding[0]
+ + dilation[0] * (kernel_size[0] - 1)
+ + output_padding[0]
+ + 1
+ )
+ W_out = (
+ (W_in - 1) * stride[1]
+ - 2 * padding[1]
+ + dilation[1] * (kernel_size[1] - 1)
+ + output_padding[1]
+ + 1
+ )
+
+ # Compute the upsampled (top-left) position for each input pixel
+ h_idx = torch.arange(H_in, device=input_tensor.device)
+ w_idx = torch.arange(W_in, device=input_tensor.device)
+ grid_h, grid_w = torch.meshgrid(h_idx, w_idx, indexing="ij")
+ out_h_idx = grid_h * stride[0] - padding[0]
+ out_w_idx = grid_w * stride[1] - padding[1]
+
+ # Compute all input pixel positions (flattened)
+ ch_idx = torch.arange(C * H_in * W_in, device=input_tensor.device)
+ ij_idx = ch_idx % (H_in * W_in)
+ i_idx = ij_idx // W_in
+ j_idx = ij_idx % W_in
+
+ # For each input pixel, compute the output positions for the kernel window
+ kh_idx = torch.arange(kernel_size[0], device=input_tensor.device)
+ kw_idx = torch.arange(kernel_size[1], device=input_tensor.device)
+ kh_grid, kw_grid = torch.meshgrid(kh_idx, kw_idx, indexing="ij")
+ kh_grid = kh_grid.reshape(-1)
+ kw_grid = kw_grid.reshape(-1)
+ num_kernel = kernel_size[0] * kernel_size[1]
+
+ # Broadcast to all channels and kernel positions
+ ch_idx_b = ch_idx.repeat_interleave(num_kernel)
+ n_kernel = ch_idx.shape[0] * num_kernel
+
+ i_idx_b = i_idx.repeat_interleave(num_kernel)
+ j_idx_b = j_idx.repeat_interleave(num_kernel)
+ kh_b = kh_grid.repeat(ch_idx.shape[0])
+ kw_b = kw_grid.repeat(ch_idx.shape[0])
+
+ h_out = out_h_idx[i_idx_b, j_idx_b] + kh_b * dilation[0]
+ w_out = out_w_idx[i_idx_b, j_idx_b] + kw_b * dilation[1]
+
+ # Mask for valid output positions
+ valid = (h_out >= 0) & (h_out < H_out) & (w_out >= 0) & (w_out < W_out)
+
+ # Prepare indices for advanced indexing
+ n_idx = (
+ torch.arange(N, device=input_tensor.device)
+ .view(-1, 1)
+ .expand(N, n_kernel)
+ .reshape(-1)
+ )
+ ch_idx_full = ch_idx_b.expand(N, n_kernel).reshape(-1)
+ h_out_full = h_out.expand(N, n_kernel).reshape(-1)
+ w_out_full = w_out.expand(N, n_kernel).reshape(-1)
+ valid_full = valid.expand(N, n_kernel).reshape(-1)
+
+ # Gather input values for each channel
+ inp_vals = inp_flat[:, ch_idx_b].reshape(-1)
+
+ # Create output tensor
+ patches = torch.zeros((N, C * H_in * W_in, H_out, W_out), dtype=input_tensor.dtype)
+
+ # If in_zero_point is provided, fill patches with it
+ if in_zero_point is not None:
+ if in_zero_point.numel() == 1:
+ patches.fill_(in_zero_point.item())
+ else:
+ # Broadcast in_zero_point to (N, C, H_in, W_in)
+ assert in_zero_point.shape == (N,)
+ in_zero_point = in_zero_point.view(N, 1, 1, 1)
+ patches = patches + in_zero_point
+
+ # Scatter input values to output positions (only valid positions)
+ patches[
+ n_idx[valid_full],
+ ch_idx_full[valid_full],
+ h_out_full[valid_full],
+ w_out_full[valid_full],
+ ] = inp_vals[valid_full]
+
+ # Optionally, flatten to (N, num_patches, patch_size) if needed
+ patches = patches.view(N, C * H_in * W_in, -1).transpose(1, 2).contiguous()
+ return patches
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 0aa1f0a243a..f78d2292e7b 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -2136,3 +2136,173 @@ def test_im2row(
torch.equal(output, expected_output),
f"im2row output mismatch in {name}: got {output}, expected {expected_output}",
)
+
+ @expand(
+ [
+ (
+ "basic_2x2",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ None,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 0, 0, 0],
+ [1, 2, 0, 0],
+ [0, 2, 0, 0],
+ [1, 0, 3, 0],
+ [1, 2, 3, 4],
+ [0, 2, 0, 4],
+ [0, 0, 3, 0],
+ [0, 0, 3, 4],
+ [0, 0, 0, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "basic_2x2_with_zero_point",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2),
+ (1, 1),
+ (0, 0),
+ (1, 1),
+ (0, 0),
+ torch.tensor(100, dtype=torch.int32),
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 100, 100, 100],
+ [1, 2, 100, 100],
+ [100, 2, 100, 100],
+ [1, 100, 3, 100],
+ [1, 2, 3, 4],
+ [100, 2, 100, 4],
+ [100, 100, 3, 100],
+ [100, 100, 3, 4],
+ [100, 100, 100, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "basic_2x2_with_stride_2",
+ torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+ (2, 2), # kernel size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (2, 2), # stride
+ (0, 0), # output padding
+ None,
+ False,
+ torch.tensor(
+ [
+ [
+ [1, 0, 0, 0],
+ [1, 0, 0, 0],
+ [0, 2, 0, 0],
+ [0, 2, 0, 0],
+ [1, 0, 0, 0],
+ [1, 0, 0, 0],
+ [0, 2, 0, 0],
+ [0, 2, 0, 0],
+ [0, 0, 3, 0],
+ [0, 0, 3, 0],
+ [0, 0, 0, 4],
+ [0, 0, 0, 4],
+ [0, 0, 3, 0],
+ [0, 0, 3, 0],
+ [0, 0, 0, 4],
+ [0, 0, 0, 4],
+ ]
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ (
+ "batch2_with_batch2_zero_point",
+ torch.tensor(
+ [
+ [[[1, 2], [3, 4]]],
+ [[[5, 6], [7, 8]]],
+ ],
+ dtype=torch.int32,
+ ), # input: (2,1,2,2)
+ (2, 2), # kernel_size
+ (1, 1), # dilation
+ (0, 0), # padding
+ (1, 1), # stride
+ (0, 0), # output_padding
+ torch.tensor([100, 200], dtype=torch.int32), # in_zero_point per batch
+ False, # channel_last
+ torch.tensor(
+ [
+ [
+ [1, 100, 100, 100],
+ [1, 2, 100, 100],
+ [100, 2, 100, 100],
+ [1, 100, 3, 100],
+ [1, 2, 3, 4],
+ [100, 2, 100, 4],
+ [100, 100, 3, 100],
+ [100, 100, 3, 4],
+ [100, 100, 100, 4],
+ ],
+ [
+ [5, 200, 200, 200],
+ [5, 6, 200, 200],
+ [200, 6, 200, 200],
+ [5, 200, 7, 200],
+ [5, 6, 7, 8],
+ [200, 6, 200, 8],
+ [200, 200, 7, 200],
+ [200, 200, 7, 8],
+ [200, 200, 200, 8],
+ ],
+ ],
+ dtype=torch.int32,
+ ),
+ ),
+ ]
+ )
+ def test_transposed_im2row(
+ self,
+ name: str,
+ input_tensor: torch.Tensor,
+ kernel_size: tuple[int, int],
+ dilation: tuple[int, int],
+ padding: tuple[int, int],
+ stride: tuple[int, int],
+ output_padding: tuple[int, int],
+ in_zero_point: torch.Tensor | int | None,
+ channel_last: bool,
+ expected_output: torch.Tensor,
+ ) -> None:
+ output = torch.ops.cadence.transposed_im2row(
+ input_tensor,
+ kernel_size,
+ dilation,
+ padding,
+ stride,
+ output_padding,
+ in_zero_point,
+ channel_last,
+ )
+
+ self.assertEqual(
+ output.shape,
+ expected_output.shape,
+ f"transposed_im2row output shape mismatch in {name}: got {output.shape}, expected {expected_output.shape}",
+ )
+ self.assertTrue(
+ torch.equal(output, expected_output),
+ f"transposed_im2row output mismatch in {name}: got {output}, expected {expected_output}",
+ )
From 0b5a4ab1ff1ebe3262742764c19d5c8cc15874ef Mon Sep 17 00:00:00 2001
From: Eli Amesefe
Date: Fri, 3 Oct 2025 20:19:41 -0700
Subject: [PATCH 123/266] Update linear -> conv2d int16 for Ethos
Differential Revision: D83632029
Pull Request resolved: https://github.com/pytorch/executorch/pull/14763
---
backends/arm/operators/op_conv2d.py | 6 +++---
backends/arm/test/ops/test_linear.py | 14 ++------------
2 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py
index 469e6613c1f..933e353387b 100644
--- a/backends/arm/operators/op_conv2d.py
+++ b/backends/arm/operators/op_conv2d.py
@@ -182,11 +182,11 @@ def define_node(
acc_type = ts.DType.FP32
tosa_graph.addConst(
- [1], output.dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
+ [1], inputs[0].dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
)
tosa_graph.addConst(
[1],
- output.dtype,
+ inputs[1].dtype,
weight_zp,
name=f"{conv2d_output_name}_weight_zp",
)
@@ -269,7 +269,7 @@ def define_node(
# For quantized convolution, rescale the output value back to the same
# integer value domain of the next op. Otherwise return float32 output.
- if inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.INT16:
+ if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16:
# Get scale_factor from input, weight, and output.
input_scale = input_qparams[0].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore [61]
per_channel_quant = input_qparams[1].per_channel # pyre-ignore [61]
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index bd719954ff5..4029fcef54e 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -8,8 +8,6 @@
from typing import Tuple
-import pytest
-
import torch
from executorch.backends.arm.quantizer.arm_quantizer import (
get_symmetric_a16w8_quantization_config,
@@ -313,12 +311,8 @@ def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
pipeline.run()
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
@common.XfailIfNoCorstone300
-@pytest.mark.xfail(
- reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
- strict=False,
-)
def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
"""Test linear operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
test_data, out_features, has_bias, per_channel_quantization = test_data()
@@ -347,12 +341,8 @@ def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
pipeline.run()
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
@common.XfailIfNoCorstone320
-@pytest.mark.xfail(
- reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
- strict=False,
-)
def test_linear_16a8w_u85_INT16(test_data: torch.Tensor):
"""Test linear operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
test_data, out_features, has_bias, per_channel_quantization = test_data()
From ca9fc0613063ce8d15148ca9c3dfe7e94b6b14c0 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Fri, 3 Oct 2025 23:32:01 -0400
Subject: [PATCH 124/266] [Release Only] Bugfix/fix nxp separable conv test
(#14800)
### Summary
Fix failing separable convolution test.
The error is larger on the CI than on my PC.
Fixes #14709
### Test plan
N/A
Co-authored-by: Martin Pavella
---
backends/nxp/tests/test_split_group_convolution.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py
index 21ab1c5b59a..4c9f277e34d 100644
--- a/backends/nxp/tests/test_split_group_convolution.py
+++ b/backends/nxp/tests/test_split_group_convolution.py
@@ -110,7 +110,7 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int
input_data = torch.randn(input_shape, dtype=torch.float32)
out1 = original_module(input_data).detach().numpy()
out2 = modified_module(input_data).detach().numpy()
- assert np.allclose(out1, out2, atol=2.0e-7)
+ assert np.allclose(out1, out2, atol=2.0e-7, rtol=1.9e-4)
# Make sure the graph can be correctly quantized and lowered to edge.
ep = _quantize_and_lower_module(
From 3f0896a5d9dd70f5c21bf2368640d748192f0238 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Sat, 4 Oct 2025 00:00:31 -0400
Subject: [PATCH 125/266] [ET-VK] Miscellaneous fixes (#14801)
Collecting fixes for various models/ops in this diff/PR.
They have all been squashed into this single change to make it easier to cherry pick.
# Fixes
## Wav2Letter
Type: Output correctness failure
This is caused by a bug in swiftshader, and not reproducible on any other platform. Specifically, the issue is in the softmax shader; the exact cause of the issue is unknown, but it is related to using shared memory within shaders. The workaround for this issue is to use separate shared memory arrays for the shared max and shared sum.
## ConvNeXT
Type: Exception during runtime
This is caused by an incompatible memory layout being used for mean2d. More technically, the packed dimension of the tensor cannot be one of the dims being reduced. The current operator registry system did not have a way to select valid tensor representations based on the actual arguments of an op.
To fix, we have to introduce a mechanism for ops to specify valid representations once a node's arguments are known. Once the model is exported with supported memory layout, the model test passes.
## Inception_V3/ViT
Type: Exception during runtime
The root cause of this was an interaction between the fuse batch norm pass and how `vulkan_preprocess.py` was applying passes. Essentially, the fuse batch norm pass creates a new param node for the fused weight, but after the pass is applied `_copy_module` is used to copy the transformed graph back into the ExportedProgram. However, it seems that _copy_module lowercases the node names without updating the exported program's graph signature. Therefore, subsequent passes couldn't recognize the weight tensor of convolution tensors as a constant/parameter node.
The solution was to migrate vulkan_preprocess.py to use the _transform() API instead of using _copy_module.
## DenseNet 161 (w/ dynamic shapes)
Type: Output Mismatch
Cause: the native_batch_norm op doesn't support dynamic shapes. However, the backend test runner doesn't set the correct compile option to filter ops without dynamic shape support.
Differential Revision: [D83703496](https://our.internmc.facebook.com/intern/diff/D83703496/)
[ghstack-poisoned]
---
.github/workflows/pull.yml | 7 +-
backends/vulkan/_passes/fold_qdq.py | 5 +-
backends/vulkan/_passes/fuse_patterns.py | 10 +-
backends/vulkan/_passes/fuse_quantized_ops.py | 10 +-
.../vulkan/_passes/tag_memory_meta_pass.py | 4 +
backends/vulkan/op_registry.py | 93 +++++++++----
.../vulkan/partitioner/vulkan_partitioner.py | 10 +-
backends/vulkan/patterns/quantized_linear.py | 12 +-
.../vulkan/runtime/graph/ops/glsl/conv2d.glsl | 2 +-
.../runtime/graph/ops/glsl/conv2d_dw.glsl | 2 +-
.../graph/ops/glsl/conv2d_dw_output_tile.glsl | 4 +
.../vulkan/runtime/graph/ops/glsl/full.yaml | 1 +
.../runtime/graph/ops/glsl/softmax.glsl | 27 ++--
.../runtime/graph/ops/impl/BatchNorm.cpp | 14 +-
.../vulkan/runtime/graph/ops/impl/Permute.cpp | 8 +-
.../vulkan/runtime/graph/ops/impl/Pool.cpp | 4 +-
.../vulkan/runtime/graph/ops/impl/Squeeze.cpp | 9 +-
backends/vulkan/test/TARGETS | 1 -
backends/vulkan/test/test_vulkan_passes.py | 70 +---------
backends/vulkan/test/utils.py | 4 +-
backends/vulkan/utils.py | 19 ++-
backends/vulkan/vulkan_preprocess.py | 59 ++++----
examples/vulkan/export.py | 127 +++++++++++-------
23 files changed, 298 insertions(+), 204 deletions(-)
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index c15fadd102f..845cb5d8631 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -970,11 +970,16 @@ jobs:
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build
# Test models serially
- models="mv2 mv3 edsr resnet18 resnet50 dl3"
+ models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4"
for model in $models; do
python -m examples.vulkan.export --model_name=$model --test
done
+ # For selected vision models, test with dynamic shapes
+ models="mv2 resnet18 resnet50 ic3 densenet161"
+ for model in $models; do
+ python -m examples.vulkan.export --model_name=$model --test -d
+ done
test-vulkan-operators-linux:
name: test-vulkan-operators-linux
diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py
index 3beccc2205c..a6a5e751c05 100644
--- a/backends/vulkan/_passes/fold_qdq.py
+++ b/backends/vulkan/_passes/fold_qdq.py
@@ -17,9 +17,8 @@ class FoldQDQPass(ExportPass):
valid quant op patterns have already been fused before this pass.
"""
- def __init__(self, edge_program: torch.export.ExportedProgram):
- super(FoldQDQPass, self).__init__()
- self.edge_program = edge_program
+ def __init__(self):
+ super().__init__()
def call(self, graph_module: torch.fx.GraphModule):
for node in graph_module.graph.nodes:
diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py
index 6ced1f32a7c..1575dd6a4f6 100644
--- a/backends/vulkan/_passes/fuse_patterns.py
+++ b/backends/vulkan/_passes/fuse_patterns.py
@@ -4,6 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+from typing import Optional
+
import executorch.backends.vulkan.patterns as vk_patterns
import torch
@@ -13,13 +15,15 @@
class FusePatternsPass(ExportPass):
- def __init__(self, exported_program: ExportedProgram) -> None:
+ def __init__(self) -> None:
super().__init__()
- self.program = exported_program
+ self._exported_program: Optional[ExportedProgram] = None
def call(self, graph_module: torch.fx.GraphModule):
+ assert self._exported_program is not None
+
total_replaced = vk_patterns.replace_all_fusable_subgraphs(
- self.program, graph_module
+ self._exported_program, graph_module
)
if total_replaced > 0:
diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py
index ca9f7541159..bb8cf5f2e64 100644
--- a/backends/vulkan/_passes/fuse_quantized_ops.py
+++ b/backends/vulkan/_passes/fuse_quantized_ops.py
@@ -211,18 +211,20 @@ def fuse_into_linear_qcnw_node(
class FuseQuantizedOpsTransform(ExportPass):
- def __init__(self, exported_program: ExportedProgram) -> None:
+ def __init__(self) -> None:
super().__init__()
- self.program = exported_program
+ self._exported_program: Optional[ExportedProgram] = None
def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+ assert self._exported_program is not None
+
for node in graph_module.graph.nodes:
# Check for linear_qcnw pattern (weight-only quantization)
- qcnw_details = matches_linear_qcnw_pattern(self.program, node)
+ qcnw_details = matches_linear_qcnw_pattern(self._exported_program, node)
if qcnw_details is not None:
qcnw_method, qcnw_nbits = qcnw_details
fuse_into_linear_qcnw_node(
- self.program, graph_module, node, qcnw_method, qcnw_nbits
+ self._exported_program, graph_module, node, qcnw_method, qcnw_nbits
)
continue
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
index db53cc666a8..8ed71aa1dae 100644
--- a/backends/vulkan/_passes/tag_memory_meta_pass.py
+++ b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -230,6 +230,10 @@ def get_arg_tensor_source_repset(
"""
arg_node = op_node.args[arg_i]
+ # For non-tensor arguments, return ANY_STORAGE
+ if not utils.is_tensor_arg_node(arg_node):
+ return utils.ANY_STORAGE
+
# Special case for cat - use the first tensor in the list as representative
if isinstance(arg_node, list):
arg_node = arg_node[0]
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index a92b3b11f6f..63b57a0e79c 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -16,8 +16,6 @@
import torch
-from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout
-
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -48,6 +46,9 @@ class OpFeatures:
# Optional check function used during partitioning to determine if a node's
# inputs are supported by the operator implementation.
"are_node_inputs_supported_fn",
+ # Optional function to determine valid representation sets for input and outputs
+ # once a node's actual inputs are known.
+ "pick_io_storage_fn",
]
def __init__(
@@ -61,6 +62,7 @@ def __init__(
supports_resize: bool = False,
supports_prepacking: bool = False,
are_node_inputs_supported_fn: Optional[Callable] = allow_node,
+ pick_io_storage_fn: Optional[Callable] = None,
):
self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList(
inputs_storage if inputs_storage is not None else []
@@ -77,15 +79,21 @@ def __init__(
self.supports_prepacking = supports_prepacking
self.are_node_inputs_supported_fn = are_node_inputs_supported_fn
+ self.pick_io_storage_fn = pick_io_storage_fn
def make_op_repsets(
self,
op_node: torch.fx.Node,
texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS,
) -> utils.OpRepSets:
- return utils.OpRepSets(
- self.inputs_storage, self.outputs_storage, op_node, texture_limits
- )
+ inputs_storage = self.inputs_storage
+ outputs_storage = self.outputs_storage
+ if self.pick_io_storage_fn is not None:
+ i_storage, o_storage = self.pick_io_storage_fn(op_node)
+ inputs_storage = utils.TensorRepSetList(i_storage)
+ outputs_storage = utils.TensorRepSetList(o_storage)
+
+ return utils.OpRepSets(inputs_storage, outputs_storage, op_node, texture_limits)
#######################
@@ -410,28 +418,16 @@ def register_softmax_op():
)
def register_reduce_op():
def check_reduce_node(node: torch.fx.Node) -> bool:
+ # Only one argument implies that the reduction is over the entire tensor, which
+ # is not supported yet.
+ if len(node.args) == 1:
+ return False
+
dim_list = node.args[1]
+ # Only 1D and 2D reductions are supported at the moment.
if isinstance(dim_list, list) and len(dim_list) > 2:
return False
- if isinstance(dim_list, list) and len(dim_list) == 2:
- # Try to get the memory layout for this node
- try:
- memory_layout = utils.get_node_memory_layout(node)
-
- # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension
- if (
- memory_layout is not None
- and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT
- ):
- # For now only default layout is supported for 2D reduction.
- # Because we can't determine if the input is NCHW or NHWC here,
- # assume the reduction dimension is packed so we cannot support it.
- return False
- except (AssertionError, KeyError, AttributeError):
- # If we can't get memory layout information, we'll assume the dims aren't packed
- pass
-
def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
for arg in node.args:
if isinstance(arg, bool):
@@ -446,10 +442,41 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
return True
+ def pick_io_storage_for_reduce(node: torch.fx.Node):
+ inputs_storage = utils.ANY_TEXTURE
+ outputs_storage = utils.ANY_TEXTURE
+
+ input_tensor = node.args[0]
+ ndim = input_tensor.meta["val"].ndim
+ dim_list = node.args[1]
+ if isinstance(dim_list, list) and len(dim_list) == 2:
+ reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim)
+ reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim)
+
+ possible_packed_dims = {0, 1, 2}
+ possible_packed_dims.discard(reduce_dim1_whcn)
+ possible_packed_dims.discard(reduce_dim2_whcn)
+
+ packed_dim = possible_packed_dims.pop()
+ assert packed_dim in [0, 1, 2]
+
+ if packed_dim == 0:
+ inputs_storage = utils.WIDTH_PACKED_TEXTURE
+ outputs_storage = utils.WIDTH_PACKED_TEXTURE
+ elif packed_dim == 1:
+ inputs_storage = utils.HEIGHT_PACKED_TEXTURE
+ outputs_storage = utils.HEIGHT_PACKED_TEXTURE
+ else:
+ inputs_storage = utils.CHANNELS_PACKED_TEXTURE
+ outputs_storage = utils.CHANNELS_PACKED_TEXTURE
+
+ return inputs_storage, outputs_storage
+
return OpFeatures(
inputs_storage=utils.ANY_TEXTURE,
supports_resize=True,
are_node_inputs_supported_fn=check_reduce_node,
+ pick_io_storage_fn=pick_io_storage_for_reduce,
)
@@ -474,6 +501,23 @@ def register_2d_pool_op():
]
)
def register_convolution_op():
+ def check_conv_node(node: torch.fx.Node) -> bool:
+ x = node.args[0]
+ x_shape = x.meta["val"].size()
+ # 4-D input implies 2D convolution
+ if len(x_shape) == 4:
+ batches = x.meta["val"].size()[0]
+ if batches != 1:
+ return False
+ # 3-D input implies 1D convolution
+ if len(x_shape) == 3:
+ transpose = node.args[6]
+ # Transposed 1D convolution is not supported yet
+ if transpose:
+ return False
+
+ return True
+
return OpFeatures(
inputs_storage=[
utils.CHANNELS_PACKED_TEXTURE, # input
@@ -490,6 +534,7 @@ def register_convolution_op():
],
supports_resize=True,
supports_prepacking=True,
+ are_node_inputs_supported_fn=check_conv_node,
)
@@ -716,6 +761,7 @@ def register_ported_ops_with_prepacking():
return OpFeatures(
inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
supports_prepacking=True,
+ supports_resize=True,
)
@@ -746,6 +792,7 @@ def register_ported_ops_with_prepacking_all_dims():
return OpFeatures(
inputs_storage=utils.ANY_TEXTURE,
supports_prepacking=True,
+ supports_resize=True,
)
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index e5b2d0f7864..0bdc16616ef 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -36,7 +36,7 @@
Partitioner,
PartitionResult,
)
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from executorch.exir.dialects._ops import ops as exir_ops
from torch.export.exported_program import ExportedProgram
@@ -254,9 +254,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901
self.log_skip(node, "permute node of non compatible linear node")
return False
- is_in_local_scalar_dense_chain, dst_node_is_compatible = (
- self.is_in_local_scalar_dense_chain(node)
- )
+ (
+ is_in_local_scalar_dense_chain,
+ dst_node_is_compatible,
+ ) = self.is_in_local_scalar_dense_chain(node)
if is_in_local_scalar_dense_chain and dst_node_is_compatible:
return True
elif is_in_local_scalar_dense_chain:
@@ -419,6 +420,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.")
tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py
index 882d0d41e6d..374e29c634d 100644
--- a/backends/vulkan/patterns/quantized_linear.py
+++ b/backends/vulkan/patterns/quantized_linear.py
@@ -92,9 +92,11 @@ def __init__(self, mm_node: torch.fx.Node) -> None:
return
# Identify input node
- self.fp_input_node, self.quantize_input_node, dq_node = (
- utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
- )
+ (
+ self.fp_input_node,
+ self.quantize_input_node,
+ dq_node,
+ ) = utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
assert self.fp_input_node is not None
self.all_nodes.append(self.fp_input_node)
@@ -386,7 +388,7 @@ def make_linear_dq8ca_q4gsw_op(
weight_sums_node = create_constant_placeholder(
exp_program=ep,
graph=graph_module.graph,
- kind=InputKind.CONSTANT_TENSOR,
+ kind=InputKind.PARAMETER,
name=sums_name,
data=sum_per_quant_group,
)
@@ -429,7 +431,7 @@ def make_linear_q8ta_q8csw_custom_op(
weight_sums_node = create_constant_placeholder(
exp_program=ep,
graph=graph_module.graph,
- kind=InputKind.CONSTANT_TENSOR,
+ kind=InputKind.PARAMETER,
name=sums_name,
data=sum_per_output_channel,
)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
index 0f5dbc41273..88746c5594e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -60,7 +60,7 @@ void main() {
int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y;
start.y = ipos.y + num_steps * dilation.y;
}
- const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy));
+ const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
// Compute the start of the kernel based on how far we are skipping ahead when
// reading the input. Note that these are "canonical" indices.
ivec2 kstart = (start - ipos) / dilation;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 02fbef29b75..9089f87d658 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -54,7 +54,7 @@ void main() {
// Compute the start and end of the input indices to load. Padding is assumed
// to be constant 0 padding, so reads from the padding region are skipped.
const ivec2 start = ipos;
- const ivec2 end = ipos + overlay_region.xy;
+ const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
int kx = 0;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 19250419baf..7448b042cad 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -97,6 +97,10 @@ void main() {
for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) {
for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) {
in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+ // Set to zero if reading out of bounds
+ if (any(greaterThanEqual(ivec2(x, y), in_sizes.xy))) {
+ in_texels[j] = VEC4_T(0);
+ }
}
// from 2nd iteration onwards accumulate dot product in 2nd sum
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
index eff78a7938d..1a5b0cb235e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
@@ -14,5 +14,6 @@ full:
DTYPE:
- VALUE: half
- VALUE: float
+ - VALUE: int32
shader_variants:
- NAME: full
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
index d35492bc367..86a2229c416 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
@@ -42,7 +42,8 @@ layout(constant_id = 5) const int group_dim = 1;
// work group will write into its assigned element in the shared array.
#define MAX_NTHREADS 16
-shared vec4 shared_vecs[MAX_NTHREADS];
+shared vec4 shared_max[MAX_NTHREADS];
+shared vec4 shared_sum[MAX_NTHREADS];
#include "indexing_utils.h"
@@ -102,13 +103,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
max_elements = max(max_elements, load_texel(tin, scan_pos));
}
- shared_vecs[smi] = max_elements;
+ shared_max[smi] = max_elements;
barrier();
// Iterate over the partial maximums to obtain the overall maximum
group_i = tid.y * NWORKERS;
- max_elements = shared_vecs[group_i++];
+ max_elements = shared_max[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- max_elements = max(max_elements, shared_vecs[group_i]);
+ max_elements = max(max_elements, shared_max[group_i]);
}
scan_pos[reduce_dim] = tid.x;
@@ -118,13 +119,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
denominators += exp(load_texel(tin, scan_pos) - max_elements);
}
- shared_vecs[smi] = denominators;
+ shared_sum[smi] = denominators;
barrier();
// Iterate over the partial sums to obtain the overall sum
group_i = tid.y * NWORKERS;
- denominators = shared_vecs[group_i++];
+ denominators = shared_sum[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- denominators += shared_vecs[group_i];
+ denominators += shared_sum[group_i];
}
// Determine if there are any padding elements in the final texel of the
@@ -184,13 +185,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
max_elements.x = max(intex[i], max_elements.x);
}
}
- shared_vecs[smi] = max_elements;
+ shared_max[smi] = max_elements;
barrier();
// Iterate over the partial maximums to obtain the overall maximum
group_i = tid.y * NWORKERS;
- max_elements = shared_vecs[group_i++];
+ max_elements = shared_max[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- max_elements = max(max_elements, shared_vecs[group_i]);
+ max_elements = max(max_elements, shared_max[group_i]);
}
// Each element of the texel is itself a partial maximum; iterate over the
// texel to find the actual maximum
@@ -214,13 +215,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
denominators.x += exp(intex[i] - max_element);
}
}
- shared_vecs[smi] = denominators;
+ shared_sum[smi] = denominators;
barrier();
// Iterate over the partial sums to obtain the overall sum
group_i = tid.y * NWORKERS;
- denominators = shared_vecs[group_i++];
+ denominators = shared_sum[group_i++];
for (int i = 1; i < NWORKERS; ++i, group_i++) {
- denominators += shared_vecs[group_i];
+ denominators += shared_sum[group_i];
}
// Reduce over the accumulated texel to find the overall sum
float denominator = 0;
diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
index 757afd06849..a6dd8f07f53 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -19,6 +19,18 @@
namespace vkcompute {
+void resize_batch_norm_node(
+ ComputeGraph* graph,
+ const std::vector& args,
+ const std::vector& extra_args) {
+ const ValueRef out = args.at(0).refs.at(0);
+ const ValueRef self = args.at(1).refs.at(0);
+
+ // For batch norm, output dimensions are the same as input dimensions
+ std::vector new_out_sizes = graph->sizes_of(self);
+ graph->virtual_resize(out, new_out_sizes);
+}
+
ValueRef check_and_prepack_arg(
ComputeGraph& graph,
ValueRef arg_ref,
@@ -101,7 +113,7 @@ void add_native_batch_norm_node(
// Resize Args
{},
// Resizing Logic
- nullptr));
+ resize_batch_norm_node));
}
void native_batch_norm(ComputeGraph& graph, const std::vector& args) {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index 9ac4c963bc3..329620e80e6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -109,11 +109,15 @@ void add_permute_node(
{
IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
const int32_t permute_ndim =
- utils::safe_downcast(permute_dims_ptr->size());
+ utils::safe_downcast(permute_dims_ptr->size());
for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0;
nchw_i--, whcn_i++) {
- const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i);
+ int32_t permute_dim_nchw =
+ utils::safe_downcast(permute_dims_ptr->at(nchw_i));
+ if (permute_dim_nchw < 0) {
+ permute_dim_nchw += permute_ndim;
+ }
const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw;
whcn_permute_dims[whcn_i] = permute_dim_whcn;
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index 250fcdd5490..879f59667d6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -137,7 +137,7 @@ void max_pool2d(ComputeGraph& graph, const std::vector& args) {
struct DivisorParams final {
int32_t divisor_override;
- bool count_include_pad;
+ int32_t count_include_pad;
};
DivisorParams create_divisor_params(
@@ -148,7 +148,7 @@ DivisorParams create_divisor_params(
graph.val_is_int(divisor_override)
? static_cast(graph.get_int(divisor_override))
: 0,
- graph.get_bool(count_include_pad)};
+ int32_t(graph.get_bool(count_include_pad))};
}
void add_avg_pool2d_node(
diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
index 13801b45cc7..e2b73b2f3f2 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
@@ -32,8 +32,13 @@ void add_squeeze_copy_dims_node(
// 2. Squeeze outter most dim
// For these cases, just pass input to output via clone.
for (int i = 0; i < dims.size(); ++i) {
- if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) {
- squeeze_dims.push_back(dims.at(i));
+ // adjust negative dims
+ int64_t dim_val = dims.at(i);
+ if (dim_val < 0) {
+ dim_val += in_dim;
+ }
+ if (dims.at(i) != 0 && in_sizes.at(dim_val) == 1) {
+ squeeze_dims.push_back(dim_val);
}
}
if (squeeze_dims.size() == 0) {
diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS
index 53fad86f90c..ee296a4f68f 100644
--- a/backends/vulkan/test/TARGETS
+++ b/backends/vulkan/test/TARGETS
@@ -34,7 +34,6 @@ python_unittest(
deps = [
"//caffe2:torch",
"//executorch/backends/vulkan/_passes:vulkan_passes",
- "//executorch/backends/vulkan/quantizer:vulkan_quantizer",
"//executorch/backends/vulkan:vulkan_preprocess",
"//pytorch/ao:torchao", # @manual
]
diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py
index 4a30ab6c2de..438126a179f 100644
--- a/backends/vulkan/test/test_vulkan_passes.py
+++ b/backends/vulkan/test/test_vulkan_passes.py
@@ -3,15 +3,8 @@
import torch
-from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform
-from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform
from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass
-from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
- get_symmetric_quantization_config,
- VulkanQuantizer,
-)
-
from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
from executorch.exir.backend.canonical_partitioners.config_partitioner import (
@@ -94,66 +87,6 @@ def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) ->
class TestVulkanPasses(unittest.TestCase):
- def test_fuse_int8pack_mm(self):
- K = 256
- N = 256
- model = SingleLinearModule(K, N)
- sample_inputs = model.get_sample_inputs()
-
- quantizer = VulkanQuantizer()
- quantizer.set_global(
- get_symmetric_quantization_config(is_dynamic=False, weight_bits=8)
- )
-
- edge_manager = quantize_and_lower_module(
- model,
- sample_inputs,
- quantizer,
- )
-
- ep = edge_manager._edge_programs["forward"]
- edge_manager.transform(
- [
- AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(ep),
- ]
- )
-
- gm = ep.graph_module
-
- self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1)
- self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
- def test_fuse_linear_qcs4w(self):
- K = 256
- N = 256
- model = SingleLinearModule(K, N)
- sample_inputs = model.get_sample_inputs()
-
- quantizer = VulkanQuantizer()
- quantizer.set_global(
- get_symmetric_quantization_config(is_dynamic=False, weight_bits=4)
- )
-
- edge_manager = quantize_and_lower_module(
- model,
- sample_inputs,
- quantizer,
- )
-
- ep = edge_manager._edge_programs["forward"]
- edge_manager.transform(
- [
- AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(ep),
- ]
- )
-
- gm = ep.graph_module
-
- self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1)
- self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
def test_fuse_rotary_emb(self):
"""Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op."""
@@ -238,7 +171,8 @@ def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor):
# Apply the rotary embedding pass
ep = edge_manager._edge_programs["forward"]
- rotary_pass = FusePatternsPass(ep)
+ rotary_pass = FusePatternsPass()
+ rotary_pass._exported_program = ep
result = rotary_pass.call(ep.graph_module)
# Verify that the pass was successful
diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py
index bfe4e9fceee..a887c53473a 100644
--- a/backends/vulkan/test/utils.py
+++ b/backends/vulkan/test/utils.py
@@ -90,7 +90,9 @@ def export_model_to_vulkan(
qmode=QuantizationMode.NONE,
):
compile_options = {}
- exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode)
+ exported_graph = get_exported_graph(
+ model, sample_inputs, dynamic_shapes=dynamic_shapes, qmode=qmode
+ )
program = export(
exported_graph,
sample_inputs,
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 972a4f26c1b..09c57f649ae 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -128,7 +128,7 @@ def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool:
is_get_attr_node(node)
or is_param(program, node)
or is_buffer(program, node)
- or is_constant(program, node)
+ or is_lifted_tensor_constant(program, node)
)
@@ -206,6 +206,8 @@ def is_tensor_arg_node(node: Any) -> bool:
if isinstance(node, torch.fx.Node):
return is_tensor_node(node)
elif isinstance(node, (list, tuple)):
+ if len(node) == 0:
+ return False
return all(is_tensor_node(n) for n in node)
return False
@@ -1228,6 +1230,16 @@ def is_in_8bit_range(tensor: torch.Tensor) -> bool:
##
+def nchw_dim_to_whcn_dim(nchw_dim: int, ndim: int) -> int:
+ # Handle negative indices for nchw_dim
+ if nchw_dim < 0:
+ nchw_dim += ndim
+
+ assert nchw_dim >= 0 and nchw_dim < ndim
+ whcn_dim = (ndim - 1) - nchw_dim
+ return whcn_dim
+
+
def get_tensor_val_str(tensor_val: FakeTensor) -> str:
return f"{tensor_val.dtype}: {tensor_val.shape}"
@@ -1279,6 +1291,7 @@ def update_program_state_dict(
updated_tensor: torch.Tensor,
) -> None:
target_name = None
+ kind = None
# Iterate over all the tensors in the graph signature, and find
# the one corresponding to the parameter/buffer name
for input_ in program.graph_signature.input_specs:
@@ -1287,6 +1300,7 @@ def update_program_state_dict(
and isinstance(input_.arg, TensorArgument)
and input_.arg.name == buffer_name
):
+ kind = input_.kind
target_name = input_.target
break
@@ -1296,6 +1310,9 @@ def update_program_state_dict(
), f"could not find {buffer_name} in source program signature"
assert target_name in program.state_dict, f"could not find {target_name}"
+ if kind == InputKind.PARAMETER:
+ updated_tensor = torch.nn.Parameter(updated_tensor, requires_grad=False)
+
# Finally, overwrite the current tensor with updated tensor
program.state_dict[target_name] = updated_tensor
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
index 2f91d97ff58..876f7fa8900 100644
--- a/backends/vulkan/vulkan_preprocess.py
+++ b/backends/vulkan/vulkan_preprocess.py
@@ -8,7 +8,7 @@
from functools import partial
-from typing import Any, Dict, final, List
+from typing import Any, Callable, Dict, final, List
import executorch.backends.vulkan.utils as utils
@@ -56,7 +56,9 @@
from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
-from executorch.exir.program._program import _copy_module
+from executorch.exir.program._program import _transform
+
+from torch._export.verifier import Verifier
from torch.export._remove_auto_functionalized_pass import (
unsafe_remove_auto_functionalized_pass,
@@ -65,28 +67,34 @@
DEFAULT_DEBUG_HANDLE = 65535
+class _any_op(Verifier):
+ # Set training dialect to skip functional check in base verifier
+ dialect = "TRAINING"
+
+ def allowed_op_types(self):
+ return (Callable,)
+
+
# pyre-ignore
def apply_passes(program: ExportedProgram, passes) -> ExportedProgram:
for p in passes:
- if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
- new_gm = program.graph_module
- # This is a workaround to allow the memory planning pass to work without
- # having to first apply ToOutVarPass(). See the `greedy()` function in
- # `exir.memory_planning`; if this attribute isn't set, assertions in
- # `collect_spec_from_nodes()` will fail.
- if isinstance(p, MemoryPlanningPass):
- new_gm.encounter_to_out_var_failure = True
-
- new_gm_res = p(new_gm)
- assert new_gm_res is not None
- new_gm = new_gm_res.graph_module
-
+ if isinstance(p, MemoryPlanningPass) and hasattr(p, "run"):
+ p.run(program.graph_module)
+
+ elif issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
+ # Some passes require the ep to be provided. However, since the ep may be
+ # updated with each pass applied, the ep must be set right before calling
+ # the pass. _exported_program is the attribute used by XNNPACK and Vulkan
+ # passes to store the exported program.
+ if hasattr(p, "_exported_program"):
+ p._exported_program = program
+
+ program = _transform(program, p, override_verifiers=[_any_op])
# See the application of this function in exir/program/_program.py for more
# details on why this step is necessary.
if isinstance(p, SpecPropPass):
- p.update_placeholder_tensor_specs(program, new_gm)
+ p.update_placeholder_tensor_specs(program, program.graph_module)
- _copy_module(program.graph_module, new_gm)
else:
program = p(program)
@@ -159,17 +167,17 @@ def preprocess( # noqa: C901
program = apply_passes(
program,
[
- FusePatternsPass(program),
- RemoveRedundantOpsTransform(),
+ FuseBatchNormPass(program),
+ FusePatternsPass(),
+ FuseClampPass(),
AddmmToLinearTransform(),
- FuseQuantizedOpsTransform(program),
+ RemoveRedundantOpsTransform(),
+ FuseQuantizedOpsTransform(),
ReplaceQDQPass(),
- FoldQDQPass(program),
+ FoldQDQPass(),
SqueezeUnsqueezeInputs(),
FuseViewCopyTransform(),
ViewCopyToSqueezeUnsqueezePass(),
- FuseBatchNormPass(program),
- FuseClampPass(),
],
)
@@ -215,6 +223,11 @@ def preprocess( # noqa: C901
mem_planning_suite = MemoryPlanningAlgorithmSuite(
algo_list=[greedy_memory_planning]
)
+ # This is a workaround to allow the memory planning pass to work without having
+ # to first apply ToOutVarPass(). See the `greedy()` function in
+ # `exir.memory_planning`; if this attribute isn't set, assertions in
+ # `collect_spec_from_nodes()` will fail.
+ program.graph_module.encounter_to_out_var_failure = True
program = apply_passes(
program,
[
diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py
index c90b501df6f..dace37e5473 100644
--- a/examples/vulkan/export.py
+++ b/examples/vulkan/export.py
@@ -14,22 +14,18 @@
import backends.vulkan.test.utils as test_utils
import torch
+import torchvision
-from executorch.backends.transforms.convert_dtype_pass import I64toI32
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
serialize_from_bundled_program_to_flatbuffer,
)
-from executorch.exir import (
- EdgeCompileConfig,
- ExecutorchBackendConfig,
- to_edge_transform_and_lower,
-)
+from executorch.exir import to_edge_transform_and_lower
from executorch.extension.export_util.utils import save_pte_program
from executorch.extension.pytree import tree_flatten
-from torch.export import export
+from torch.export import Dim, export
from ..models import MODEL_NAME_TO_MODEL
from ..models.model_factory import EagerModelFactory
@@ -38,6 +34,67 @@
logging.basicConfig(level=logging.INFO, format=FORMAT)
+def is_vision_model(model_name):
+ if model_name in [
+ # These models are also registered in examples/models
+ "dl3",
+ "edsr",
+ "mv2",
+ "mv3",
+ "vit",
+ "ic3",
+ "ic4",
+ "resnet18",
+ "resnet50",
+ # These models are not registered in examples/models but are available via
+ # torchvision
+ "convnext_small",
+ "densenet161",
+ "shufflenet_v2_x1_0",
+ ]:
+ return True
+
+ return False
+
+
+def get_vision_model_sample_input():
+ return (torch.randn(1, 3, 224, 224),)
+
+
+def get_vision_model_dynamic_shapes():
+ return (
+ {
+ 2: Dim("height", min=1, max=16) * 16,
+ 3: Dim("width", min=1, max=16) * 16,
+ },
+ )
+
+
+def init_model(model_name):
+ if model_name == "convnext_small":
+ return torchvision.models.convnext_small()
+ if model_name == "densenet161":
+ return torchvision.models.densenet161()
+ if model_name == "shufflenet_v2_x1_0":
+ return torchvision.models.shufflenet_v2_x1_0()
+
+ return None
+
+
+def get_sample_inputs(model_name):
+ if is_vision_model(model_name):
+ return get_vision_model_sample_input()
+
+ return None
+
+
+def get_dynamic_shapes(model_name):
+ if is_vision_model(model_name):
+ return get_vision_model_dynamic_shapes()
+
+ return None
+
+
def main() -> None:
logger = logging.getLogger("")
logger.setLevel(logging.INFO)
@@ -68,21 +125,6 @@ def main() -> None:
help="whether to export with strict mode. Default is True",
)
- parser.add_argument(
- "-a",
- "--segment_alignment",
- required=False,
- help="specify segment alignment in hex. Default is 0x1000. Use 0x4000 for iOS",
- )
-
- parser.add_argument(
- "-e",
- "--external_constants",
- action=argparse.BooleanOptionalAction,
- default=False,
- help="Save constants in external .ptd file. Default is False",
- )
-
parser.add_argument(
"-d",
"--dynamic",
@@ -119,31 +161,35 @@ def main() -> None:
args = parser.parse_args()
- if args.model_name not in MODEL_NAME_TO_MODEL:
- raise RuntimeError(
- f"Model {args.model_name} is not a valid name. "
- f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+ if args.model_name in MODEL_NAME_TO_MODEL:
+ model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
+ *MODEL_NAME_TO_MODEL[args.model_name]
)
+ else:
+ model = init_model(args.model_name)
+ example_inputs = get_sample_inputs(args.model_name)
+ dynamic_shapes = get_dynamic_shapes(args.model_name) if args.dynamic else None
- model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
- *MODEL_NAME_TO_MODEL[args.model_name]
- )
+ if model is None:
+ raise RuntimeError(
+ f"Model {args.model_name} is not a valid name. "
+ f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+ )
# Prepare model
model.eval()
# Setup compile options
compile_options = {}
- if args.dynamic or dynamic_shapes is not None:
+ if args.dynamic:
compile_options["require_dynamic_shapes"] = True
+ # Try to manually get the dynamic shapes for the model if not set
+ if dynamic_shapes is None:
+ dynamic_shapes = get_dynamic_shapes(args.model_name)
+
if args.force_fp16:
compile_options["force_fp16"] = True
- # Configure Edge compilation
- edge_compile_config = EdgeCompileConfig(
- _skip_dim_order=False, # Proper handling for Vulkan memory format
- )
-
logging.info(f"Exporting model {args.model_name} with Vulkan delegate")
# Export the model using torch.export
@@ -157,10 +203,6 @@ def main() -> None:
# Transform and lower with Vulkan partitioner
edge_program = to_edge_transform_and_lower(
program,
- compile_config=edge_compile_config,
- transform_passes=[
- I64toI32(edge_compile_config._skip_dim_order),
- ],
partitioner=[VulkanPartitioner(compile_options)],
generate_etrecord=args.etrecord,
)
@@ -169,13 +211,8 @@ def main() -> None:
f"Exported and lowered graph:\n{edge_program.exported_program().graph}"
)
- # Configure backend options
- backend_config = ExecutorchBackendConfig(external_constants=args.external_constants)
- if args.segment_alignment is not None:
- backend_config.segment_alignment = int(args.segment_alignment, 16)
-
# Create executorch program
- exec_prog = edge_program.to_executorch(config=backend_config)
+ exec_prog = edge_program.to_executorch()
# Save ETRecord if requested
if args.etrecord:
From 881915d21d8704eaee45183108626c77ed5fdfd4 Mon Sep 17 00:00:00 2001
From: Hardik Sharma
Date: Sat, 4 Oct 2025 10:32:16 -0700
Subject: [PATCH 126/266] Add platforms for all operator library sub-targets.
Differential Revision: D83680406
Pull Request resolved: https://github.com/pytorch/executorch/pull/14728
---
shim_et/xplat/executorch/codegen/codegen.bzl | 108 ++++++++++--------
.../kernels/prim_ops/selective_build.bzl | 1 +
2 files changed, 60 insertions(+), 49 deletions(-)
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 3546b64cdb6..0002884b2a4 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -1,12 +1,12 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_default_executorch_platforms", "is_xplat", "runtime", "struct_to_json")
load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
-load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
-load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
load(
"@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
"get_vec_deps",
"get_vec_preprocessor_flags",
)
+load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
+load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
load("@fbsource//xplat/executorch/kernels/prim_ops:selective_build.bzl", "prim_ops_registry_selective")
# Headers that declare the function signatures of the C++ functions that
@@ -96,15 +96,17 @@ def _get_prim_ops_registry_target(name, deps, aten_suffix, platforms):
Returns:
String: Target name for the appropriate prim ops registry
"""
+
# If selective build targets are specified, create a selective prim ops registry
# Create a selective prim ops registry using the existing function
selective_prim_ops_registry_name = name + "_selected_prim_ops_registry"
combined_prim_ops_header_target_name = name + "_combined_prim_ops_header"
selected_prim_operators_genrule(combined_prim_ops_header_target_name, deps, platforms)
+
# Use the existing prim_ops_registry_selective function
prim_ops_registry_selective(
name = selective_prim_ops_registry_name,
- selected_prim_ops_header_target = ":"+combined_prim_ops_header_target_name,
+ selected_prim_ops_header_target = ":" + combined_prim_ops_header_target_name,
aten_suffix = aten_suffix,
platforms = platforms,
)
@@ -123,11 +125,16 @@ def _extract_prim_ops_from_lists(ops, ops_dict):
Returns:
Tuple of (prim_ops, remaining_ops, remaining_ops_dict)
"""
+
def _is_aten_prim_op(op_name):
if not op_name.startswith("aten::"):
return False
for prim_suffix in [
- "sym_size", "sym_numel", "sym_max", "sym_min", "sym_float"
+ "sym_size",
+ "sym_numel",
+ "sym_max",
+ "sym_min",
+ "sym_float",
]:
if prim_suffix in op_name:
return True
@@ -169,7 +176,6 @@ def et_operator_library(
ops_schema_yaml_target = None,
server_generated_yaml_target = None,
**kwargs):
-
# Check if we should extract prim ops from the operator lists
# Note that selective build for prim ops doesnt support model or ops_schema_yaml_target or server_generated_yaml_target
# TODO: Add support for selective build for prim ops with model or ops_schema_yaml_target or server_generated_yaml_target
@@ -178,6 +184,7 @@ def et_operator_library(
if should_extract_prim_ops:
# Extract prim ops from ops and ops_dict
prim_ops, remaining_ops, remaining_ops_dict = _extract_prim_ops_from_lists(ops, ops_dict)
+
# Use the remaining ops (with prim ops removed) for the main et_operator_library
final_ops = remaining_ops
final_ops_dict = remaining_ops_dict
@@ -189,6 +196,7 @@ def et_operator_library(
selected_operator_yaml_filename = "selected_operators.yaml"
selected_prim_ops_filename = "selected_prim_ops.h"
+
# Generate the main operator library with the final ops
# do a dummy copy if server_generated_yaml_target is set
if server_generated_yaml_target:
@@ -231,6 +239,7 @@ def et_operator_library(
"--prim_op_names=" + ",".join(prim_ops),
"--output_dir=${OUT}",
]
+
# Here we generate the selected_prim_ops.h and the selected_operators.yaml file
# both with single genrule
genrule_cmd = genrule_cmd + [" && "] + prim_ops_genrule_cmd
@@ -307,7 +316,6 @@ def _prepare_genrule_and_lib(
if support_exceptions:
genrule_cmd.append("--add-exception-boundary")
-
# Sources for generated kernel registration lib
sources = MANUAL_REGISTRATION_SOURCES if manual_registration else GENERATED_SOURCES
@@ -371,7 +379,8 @@ def _prepare_custom_ops_genrule_and_lib(
custom_ops_yaml_path = None,
support_exceptions = True,
deps = [],
- kernels = []):
+ kernels = [],
+ platforms = get_default_executorch_platforms()):
"""Similar to _prepare_genrule_and_lib but for custom ops."""
genrules = {}
libs = {}
@@ -390,6 +399,7 @@ def _prepare_custom_ops_genrule_and_lib(
"--output_dir $OUT ").format(deps = " ".join(["\"{}\"".format(d) for d in deps])),
outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
default_outs = ["."],
+ platforms = platforms,
)
# genrule for generating operator kernel bindings
@@ -460,6 +470,7 @@ def exir_custom_ops_aot_lib(
kernels = kernels,
support_exceptions = support_exceptions,
deps = deps,
+ platforms = platforms,
)
for genrule in genrules:
runtime.genrule(
@@ -468,6 +479,7 @@ def exir_custom_ops_aot_lib(
cmd = genrules[genrule]["cmd"],
outs = genrules[genrule]["outs"],
default_outs = ["."],
+ platforms = platforms,
)
for compiler_lib in libs:
runtime.cxx_library(
@@ -538,7 +550,7 @@ def get_optimized_lib_deps():
"//executorch/runtime/kernel:kernel_includes",
] + get_vec_deps()
-def build_portable_header_lib(name, oplist_header_name, feature = None):
+def build_portable_header_lib(name, oplist_header_name, feature = None, **kwargs):
"""Build the portable headers into a header-only library.
Ensures that includes work across portable and optimized libs.
"""
@@ -546,21 +558,23 @@ def build_portable_header_lib(name, oplist_header_name, feature = None):
name = name,
srcs = [],
exported_headers = {
- "selected_op_variants.h":":{}[selected_op_variants]".format(oplist_header_name),
+ "selected_op_variants.h": ":{}[selected_op_variants]".format(oplist_header_name),
},
exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
header_namespace = "",
feature = feature,
+ **kwargs
)
def build_portable_lib(
- name,
- et_operator_lib_deps = [],
- oplist_header_name = None,
- portable_header_lib = None,
- feature = None,
- expose_operator_symbols = False,
- visibility = ["@EXECUTORCH_CLIENTS"]):
+ name,
+ et_operator_lib_deps = [],
+ oplist_header_name = None,
+ portable_header_lib = None,
+ feature = None,
+ expose_operator_symbols = False,
+ visibility = ["@EXECUTORCH_CLIENTS"],
+ platforms = get_default_executorch_platforms()):
"""
WARNING: Before using this, please consider using executorch_generated_lib instead. This
function is only for special cases where you need to build a portable kernel library with
@@ -639,9 +653,10 @@ def build_portable_lib(
# @lint-ignore BUCKLINT link_whole
link_whole = True,
feature = feature,
+ platforms = platforms,
)
-def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False):
+def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False, platforms = get_default_executorch_platforms()):
"""Build optimized lib from source. We build from source so that the generated header file,
selected_op_variants.h, can be used to selectively build the lib for different dtypes.
"""
@@ -661,7 +676,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
# Currently fbcode links all dependent libraries through shared
# library, and it blocks users like unit tests to use kernel
# implementation directly. So we enable this for xplat only.
- compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed","-Wno-global-constructors","-Wno-shadow",]
+ compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed", "-Wno-global-constructors", "-Wno-shadow"]
if not expose_operator_symbols and is_xplat():
# Removing '-fvisibility=hidden' exposes operator symbols.
# This allows operators to be called outside of the kernel registry.
@@ -674,6 +689,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
deps = get_portable_lib_deps() + get_optimized_lib_deps() + [":" + portable_header_lib],
compiler_flags = compiler_flags,
+ platforms = platforms,
preprocessor_flags = get_vec_preprocessor_flags(),
# sleef needs to be added as a direct dependency of the operator target when building for Android,
# or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of
@@ -699,10 +715,9 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
)
def selected_operators_genrule(
- name,
- deps,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ deps,
+ platforms = get_default_executorch_platforms()):
"""Generates selected_operators.yaml from the list of deps. We look into the trasitive closure of all the deps,
and look for macros `et_operator_library`.
@@ -725,10 +740,9 @@ def selected_operators_genrule(
)
def selected_prim_operators_genrule(
- name,
- deps,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ deps,
+ platforms = get_default_executorch_platforms()):
"""Generates selected_prim_ops.h from the list of deps. We look into the transitive closure of all the deps,
and look for targets with label `et_operator_library`.
@@ -750,12 +764,11 @@ def selected_prim_operators_genrule(
)
def dtype_header_genrule(
- name,
- visibility,
- deps = [],
- selected_operators_genrule_name = None,
- platforms = get_default_executorch_platforms(),
-):
+ name,
+ visibility,
+ deps = [],
+ selected_operators_genrule_name = None,
+ platforms = get_default_executorch_platforms()):
"""Generate selected_op_variants.h from selected_operators.yaml.
Given a `selected_operators.yaml` (passed in as selected_operators_genrule_name), we should be able to determine
@@ -921,15 +934,14 @@ def executorch_generated_lib(
index = index + 1
portable = name + "_check_portable_" + dep.split(":")[1] + str(index)
message = "Dtype selective build requires that the portable library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
- check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message)
+ check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message, platforms = platforms)
if ("//executorch/kernels/optimized:optimized_operators" in kernel_deps):
index = 0
for dep in deps:
index = index + 1
optimized = name + "_check_optimized_" + dep.split(":")[1] + str(index)
message = "Dtype selective build requires that the optimized library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
- check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message)
-
+ check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message, platforms = platforms)
aten_suffix = "_aten" if aten_mode else ""
@@ -995,7 +1007,7 @@ def executorch_generated_lib(
if dtype_selective_build:
# Build portable headers lib. Used for portable and optimized kernel libraries.
portable_header_lib = name + "_portable_header_lib"
- build_portable_header_lib(portable_header_lib, oplist_header_name, feature)
+ build_portable_header_lib(portable_header_lib, oplist_header_name, feature, platforms = platforms)
if "//executorch/kernels/portable:operators" in kernel_deps:
# Remove portable from kernel_deps as we're building it from source.
@@ -1003,7 +1015,7 @@ def executorch_generated_lib(
# Build portable lib.
portable_lib_name = name + "_portable_lib"
- build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols)
+ build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols, platforms = platforms)
kernel_deps.append(":{}".format(portable_lib_name))
if "//executorch/kernels/optimized:optimized_operators" in kernel_deps:
@@ -1012,7 +1024,7 @@ def executorch_generated_lib(
# Build optimized lib.
optimized_lib_name = name + "_optimized_lib"
- build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols)
+ build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols, platforms = platforms)
kernel_deps.append(":{}".format(optimized_lib_name))
# Exports headers that declare the function signatures of the C++ functions
@@ -1111,10 +1123,9 @@ def executorch_generated_lib(
#
# If build successfully, all of the `selected_operators.yaml` will be merged into 1 `selected_operators.yaml` for debugging purpose.
def executorch_ops_check(
- name,
- deps,
- **kwargs,
-):
+ name,
+ deps,
+ **kwargs):
runtime.genrule(
name = name,
macros_only = False,
@@ -1128,16 +1139,15 @@ def executorch_ops_check(
platforms = kwargs.pop("platforms", get_default_executorch_platforms()),
outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
default_outs = ["."],
- **kwargs,
+ **kwargs
)
def check_recursive_dependencies(
- name,
- parent,
- child,
- message = "",
- **kwargs,
-):
+ name,
+ parent,
+ child,
+ message = "",
+ **kwargs):
"""
Checks if child is a transitive dependency of parent and fails if it is.
The query runs the equivalent of `buck2 uquery "allpaths(parent, child)".
diff --git a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
index a5c89147801..73421f031ec 100644
--- a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
+++ b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
@@ -28,6 +28,7 @@ def prim_ops_registry_selective(name, selected_prim_ops_header_target, aten_suff
header_name: [header_name],
"selected_prim_ops.h": ["selected_prim_ops.h"]
},
+ platforms = kwargs.get("platforms", "CXX"),
default_outs = ["."],
)
runtime.cxx_library(
From 3d8b8d1d5f1cf74bf62cc9848e2a1cfe9d6804c0 Mon Sep 17 00:00:00 2001
From: cccclai
Date: Sat, 4 Oct 2025 16:33:48 -0700
Subject: [PATCH 127/266] fix test-huggingface-transformers-* tests (#14752)
Fix these tests
https://hud.pytorch.org/hud/pytorch/executorch/main/1?per_page=50&name_filter=huggingface-transformer
Optimum is installed in a somewhat unusual way: optimum is cloned inside
the executorch folder, and executorch is then installed from the nested
optimum folder. This commit installs optimum via pip instead. The
behavior should be the same; the tests still run as expected.
---
.github/workflows/trunk.yml | 53 +++++++++++++++----------------------
1 file changed, 22 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index ae3001ca920..adf3b7da151 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -823,11 +823,26 @@ jobs:
echo "Recipe: $RECIPE"
echo "Quantize: $QUANTIZE"
- echo "::group::Set up ExecuTorch"
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
- PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
+ echo "::group::Setup ExecuTorch"
+ PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
+ echo "::endgroup::"
+
+ echo "::group::Setup Huggingface"
+ pip install -U "huggingface_hub[cli]" accelerate
+ huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+ OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+ pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+ echo "::endgroup::"
+
+ echo "::group::Test MODEL: $MODEL RECIPE: $RECIPE QUANTIZE: $QUANTIZE"
+ export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
+ python .ci/scripts/test_huggingface_optimum_model.py --model "$MODEL" --recipe "$RECIPE" $QUANTIZE --model_dir "$OUTPUT_DIR"
+ echo "::endgroup::"
+
# Build executor_runner with ETdump enabled
PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
-DCMAKE_INSTALL_PREFIX=cmake-out \
@@ -845,25 +860,6 @@ jobs:
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-Bcmake-out .
cmake --build cmake-out -j16 --target install --config Release
- echo "::endgroup::"
-
- echo "::group::Set up Hugging Face"
- pip install -U "huggingface_hub[cli]"
- huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
- OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
- git clone https://github.com/huggingface/optimum-executorch
- pushd optimum-executorch
- # There is no release yet, for CI stability, always test from the same commit on main
- git checkout $OPTIMUM_ET_COMMIT
- python install_dev.py --skip_override_torch
- popd
- pip list
- echo "::endgroup::"
-
- echo "::group::Run tests"
- export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
- python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR}
- echo "::endgroup::"
echo "::group::Generate artifacts for performance profiling"
./cmake-out/executor_runner \
@@ -930,16 +926,11 @@ jobs:
${CONDA_RUN} python install_executorch.py
echo "::endgroup::"
- echo "::group::Set up Hugging Face"
- pip install -U "huggingface_hub[cli]"
- huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
- OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
- git clone https://github.com/huggingface/optimum-executorch
- pushd optimum-executorch
- # There is no release yet, for CI stability, always test from the same commit on main
- git checkout $OPTIMUM_ET_COMMIT
- ${CONDA_RUN} python install_dev.py --skip_override_torch
- popd
+ echo "::group::Set up Huggingface"
+ ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+ ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+ OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+ ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
${CONDA_RUN} pip list
echo "::endgroup::"
From 3b16bc14ccb7e956b2a4bf0bdb541700596b1a20 Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada
Date: Sun, 5 Oct 2025 19:31:55 -0700
Subject: [PATCH 128/266] =?UTF-8?q?Summary:=20Use=20javaClassStatic()=20fo?=
=?UTF-8?q?r=20class=20references=20stored=20in=20static=20=E2=80=A6=20(#1?=
=?UTF-8?q?4744)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
…variables - creates global references safe for persistence
findClassLocal() returns a local reference. Storing it in a static
variable (e.g. `static auto exceptionClass = ...`) could trigger an
'invalid local reference' error, because local references become invalid
once the JNI frame ends.
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
### Summary
[PLEASE REMOVE] See [CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests)
for ExecuTorch PR guidelines.
[PLEASE REMOVE] If this PR closes an issue, please add a `Fixes
#` line.
[PLEASE REMOVE] If this PR introduces a fix or feature that should be
the upcoming release notes, please add a "Release notes: " label.
For a list of available release notes labels, check out
[CONTRIBUTING.md's Pull
Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests).
### Test plan
[PLEASE REMOVE] How did you test this PR? Please write down any manual
commands you used and note down tests that you have written if
applicable.
Co-authored-by: Github Executorch
---
extension/android/jni/jni_helper.cpp | 9 ++++++---
extension/android/jni/jni_helper.h | 7 +++++++
2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/extension/android/jni/jni_helper.cpp b/extension/android/jni/jni_helper.cpp
index b92856bacb2..6491524c7ac 100644
--- a/extension/android/jni/jni_helper.cpp
+++ b/extension/android/jni/jni_helper.cpp
@@ -13,10 +13,13 @@ namespace executorch::jni_helper {
void throwExecutorchException(uint32_t errorCode, const std::string& details) {
// Get the current JNI environment
auto env = facebook::jni::Environment::current();
+ if (!env) {
+ return;
+ }
- // Find the Java ExecutorchRuntimeException class
- static auto exceptionClass = facebook::jni::findClassLocal(
- "org/pytorch/executorch/ExecutorchRuntimeException");
+ // stable/global class ref — safe to cache
+ static const auto exceptionClass =
+ JExecutorchRuntimeException::javaClassStatic();
// Find the static factory method: makeExecutorchException(int, String)
static auto makeExceptionMethod =
diff --git a/extension/android/jni/jni_helper.h b/extension/android/jni/jni_helper.h
index 996d75581d3..898c1619d9c 100644
--- a/extension/android/jni/jni_helper.h
+++ b/extension/android/jni/jni_helper.h
@@ -23,4 +23,11 @@ namespace executorch::jni_helper {
*/
void throwExecutorchException(uint32_t errorCode, const std::string& details);
+// Define the JavaClass wrapper
+struct JExecutorchRuntimeException
+ : public facebook::jni::JavaClass {
+ static constexpr auto kJavaDescriptor =
+ "Lorg/pytorch/executorch/ExecutorchRuntimeException;";
+};
+
} // namespace executorch::jni_helper
From f81e8346f4153cb2e21eb33a6bdce9c1008696ae Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Mon, 6 Oct 2025 14:23:08 +0200
Subject: [PATCH 129/266] Add strict-flag to ExportSession (#14588)
**Add strict export option to ExportRecipe**
Default is True, mirroring earlier behavior.
Also update ExportSession to handle this.
Signed-off-by: Erik Lundell
---
export/export.py | 5 ++++-
export/recipe.py | 3 +++
export/stages.py | 5 ++++-
3 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/export/export.py b/export/export.py
index 86a932d153c..1e9cdbde7c0 100644
--- a/export/export.py
+++ b/export/export.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -200,7 +201,9 @@ def _build_stages(self, stages: List[StageType]) -> Dict[StageType, Stage]:
aten_transform_passes = list(
self._export_recipe.aten_transform_passes
)
- stage = TorchExportStage(aten_transform_passes)
+ stage = TorchExportStage(
+ aten_transform_passes, strict=self._export_recipe.strict
+ )
elif stage_type == StageType.TO_EDGE_TRANSFORM_AND_LOWER:
stage = EdgeTransformAndLowerStage.from_recipe(self._lowering_recipe)
elif stage_type == StageType.TO_EDGE:
diff --git a/export/recipe.py b/export/recipe.py
index 18f4b8aebb9..4465da51956 100644
--- a/export/recipe.py
+++ b/export/recipe.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -151,6 +152,7 @@ class ExportRecipe:
executorch_backend_config: Optional backend configuration for ExecuTorch
pipeline_stages: Optional list of stages to execute, defaults to a standard pipeline.
mode: Export mode (debug or release)
+ strict: Set the strict flag in the torch export call.
"""
name: Optional[str] = None
@@ -163,6 +165,7 @@ class ExportRecipe:
executorch_backend_config: Optional[ExecutorchBackendConfig] = None
pipeline_stages: Optional[List[StageType]] = None
mode: Mode = Mode.RELEASE
+ strict: bool = True
@classmethod
def get_recipe(cls, recipe: "RecipeType", **kwargs) -> "ExportRecipe":
diff --git a/export/stages.py b/export/stages.py
index 323b327bfa4..3be801c6a14 100644
--- a/export/stages.py
+++ b/export/stages.py
@@ -1,5 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -110,9 +111,11 @@ def __init__(
aten_transform_passes: Optional[
List[Callable[[str, ExportedProgram], ExportedProgram]]
] = None,
+ strict=True,
) -> None:
super().__init__()
self._aten_transform_passes = aten_transform_passes
+ self.strict = strict
@property
def stage_type(self) -> str:
@@ -147,7 +150,7 @@ def run(self, artifact: PipelineArtifact) -> None:
model,
example_inputs[method_name][0],
dynamic_shapes=method_dynamic_shapes,
- strict=True,
+ strict=self.strict,
)
# Apply pre-edge transform passes if available
From 75ebd05eba32df37211e73012b7211a4a66d9b4c Mon Sep 17 00:00:00 2001
From: Surya Siddharth Pemmaraju
Date: Mon, 6 Oct 2025 07:52:32 -0700
Subject: [PATCH 130/266] Fix OpenVINO ci (#14784)
### Summary
Re enable OpenVINO CI
Fixes #14314
### Test plan
Tested this PR locally with setup-openvino.sh and test_openvino.sh
The CI should run these two scripts and verify that all tests are
passing
---
.ci/scripts/setup-openvino.sh | 20 +++++++++-----------
.ci/scripts/test_openvino.sh | 2 +-
.github/workflows/pull.yml | 1 -
backends/openvino/partitioner.py | 8 +++++++-
backends/openvino/preprocess.py | 8 ++++++++
5 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/.ci/scripts/setup-openvino.sh b/.ci/scripts/setup-openvino.sh
index ff667619125..587494f46ac 100755
--- a/.ci/scripts/setup-openvino.sh
+++ b/.ci/scripts/setup-openvino.sh
@@ -10,19 +10,17 @@ set -ex
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
-git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout releases/2025/1
-git submodule update --init --recursive
-sudo ./install_build_dependencies.sh
-mkdir build && cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON
-make -j$(nproc)
+# Download and install OpenVINO from release packages
+OPENVINO_VERSION="2025.3"
+OPENVINO_BUILD="2025.3.0.19807.44526285f24"
+OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
-cd ..
-cmake --install build --prefix dist
+curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL}
+tar -xzf /tmp/openvino_toolkit.tgz
+mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino
-source dist/setupvars.sh
-cd ../backends/openvino
+source openvino/setupvars.sh
+cd backends/openvino
pip install -r requirements.txt
cd scripts
./openvino_build.sh --enable_python
diff --git a/.ci/scripts/test_openvino.sh b/.ci/scripts/test_openvino.sh
index 85884a6475b..2bb2115b1ec 100755
--- a/.ci/scripts/test_openvino.sh
+++ b/.ci/scripts/test_openvino.sh
@@ -10,7 +10,7 @@ set -ex
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
-source openvino/dist/setupvars.sh
+source openvino/setupvars.sh
cd backends/openvino/tests
python test_runner.py --test_type ops
python test_runner.py --test_type models
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 845cb5d8631..8248a9637ec 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -787,7 +787,6 @@ jobs:
contents: read
strategy:
fail-fast: false
- if: false # TODO Re-enable after fixing timeouts (#14314)
with:
runner: linux.2xlarge
docker-image: ci-image:executorch-ubuntu-22.04-gcc9
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..4975dc657c6 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -27,6 +27,10 @@
class OpenvinoOperatorsSupport(OperatorSupportBase):
+ extended_support_dict = {
+ "torch.ops.dim_order_ops._clone_dim_order.default": None,
+ "torch.ops.dim_order_ops._to_dim_order_copy.default": None,
+ }
def __init__(
self,
@@ -62,7 +66,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
op_type = node.target.__name__
else:
op_type = str(node.target)
- supported_ops = OperatorSupport(options)._support_dict
+ supported_ops = (
+ OperatorSupport(options)._support_dict | self.extended_support_dict
+ )
if op_type == "getitem":
return True
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..691115f6579 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -14,6 +14,8 @@
PreprocessResult,
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped]
openvino_compile,
)
@@ -36,6 +38,12 @@ def preprocess(
Returns:
PreprocessResult: The result of preprocessing, including the compiled model bytes.
"""
+ transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module)
+
+ # Update the edge_program with the transformed graph
+ if transformed_ep and transformed_ep.graph_module:
+ edge_program._graph_module = transformed_ep.graph_module
+
input_names = edge_program.graph_signature.user_inputs
args = []
for node in edge_program.graph.nodes:
From 9a7fb42d5ac95ec0d8f30759625fd9dfcca4f1db Mon Sep 17 00:00:00 2001
From: Yufeng Shi
Date: Mon, 6 Oct 2025 17:19:11 +0100
Subject: [PATCH 131/266] Arm backend: Fix torch.matmul() failures for 2D
tensor inputs (#14624)
- ConvertMmToBmmPass converts an MM node to BMM nodes, turns input and
output tensors from rank-2 to rank-3 via unsqueeze/squeeze, and inserts
q-dq before and after BMM node when necessary.
- After ConvertMmToBmmPass:
```
x -> q -> dq -> unsqueeze -> q_2 -> dq_2 ->
\
bmm -> q_4 -> dq_4
/
y -> q_1 -> dq_1 -> unsqueeze -> q_3 -> dq_3 ->
```
- Therefore, if the original matmul was 2D, the bmm already has DQ nodes
on its inputs and Q node on its output. If AnnotateDecomposedMatmulPass
(#10654) is still applied in this case, it produces illegal sequences
such as: x -> q -> unsqueeze -> q_2 (invalid)
- Fix by checking whether the BMM is already surrounded by DQ nodes on
its inputs and Q nodes on its output.
Change-Id: I9949d59b0b4a96fa34a88b0734014567ea6f24cc
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218
Signed-off-by: Yufeng Shi
Co-authored-by: Oscar Andersson
---
backends/arm/_passes/annotate_decomposed_matmul.py | 9 +++++++--
backends/arm/test/ops/test_matmul.py | 7 +++++++
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
index 6b89b0c3c4a..72ae46c76c1 100644
--- a/backends/arm/_passes/annotate_decomposed_matmul.py
+++ b/backends/arm/_passes/annotate_decomposed_matmul.py
@@ -73,7 +73,10 @@ def call(self, graph_module: GraphModule) -> PassResult:
node for node in partition.nodes if node.target in matmul_targets
][0]
- if quantized_input:
+ if quantized_input and not all(
+ input_node.target in DQ_OPS
+ for input_node in matmul_node.all_input_nodes
+ ):
matmul_args = matmul_node.all_input_nodes
for node in matmul_args:
# Find the dq-node connected to this mm/bmm arg
@@ -99,7 +102,9 @@ def call(self, graph_module: GraphModule) -> PassResult:
partition_output = list(partition.output_nodes[0].users)[0]
quantized_output = partition_output.target in Q_OPS
- if quantized_output:
+ if quantized_output and not all(
+ user.target in Q_OPS for user in matmul_node.users
+ ):
with graph_module.graph.inserting_after(matmul_node):
# Create q-node after matmul
q_node = create_node(
diff --git a/backends/arm/test/ops/test_matmul.py b/backends/arm/test/ops/test_matmul.py
index a788fc00a5d..f564672e98f 100644
--- a/backends/arm/test/ops/test_matmul.py
+++ b/backends/arm/test/ops/test_matmul.py
@@ -22,6 +22,7 @@
class MatMul(torch.nn.Module):
test_data_generators = {
+ "rand_rand_2d": lambda: (torch.rand(5, 5), torch.rand(5, 2)),
"rand_rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
"rand_rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
}
@@ -32,6 +33,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
class MatMulSingleInput(torch.nn.Module):
test_data_generators = {
+ "rand_2d": lambda: (torch.rand(5, 5),),
"rand_3d": lambda: (torch.rand(2, 5, 5),),
"rand_4d": lambda: (torch.rand(1, 2, 5, 5),),
}
@@ -42,6 +44,11 @@ def forward(self, x: torch.Tensor):
class MatMulCombo(torch.nn.Module):
test_data_generators = {
+ "rand_rand_rand_2d": lambda: (
+ torch.rand(5, 5),
+ torch.rand(5, 2),
+ torch.rand(2, 5),
+ ),
"rand_rand_rand_3d": lambda: (
torch.rand(2, 5, 5),
torch.rand(2, 5, 2),
From ed3fdad208ccf9309a61c60ed3a262fb796f8848 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 09:59:22 -0700
Subject: [PATCH 132/266] Update extension/llm/tokenizers (#14807)
---
extension/llm/tokenizers | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 65e41a96e1b..ee0ad9b6e84 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 65e41a96e1b6870d0e616cd7f9eaaf5aaa1d89f3
+Subproject commit ee0ad9b6e84622589911e2855a111b4278db114b
From 815ae92399815df6976620dbf977561ae79c4780 Mon Sep 17 00:00:00 2001
From: Ethan Ng
Date: Mon, 6 Oct 2025 10:25:13 -0700
Subject: [PATCH 133/266] Update
ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass to check if
is_tensor() is valid
Differential Revision: D83861005
Pull Request resolved: https://github.com/pytorch/executorch/pull/14798
---
backends/cadence/aot/replace_ops.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 2104764cd14..24390da5e16 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -1590,7 +1590,7 @@ def call_operator(self, op, args, kwargs, meta):
updated_args = list(args)
for op_arg_index in args_to_be_replaced:
arg = args[op_arg_index]
- if not arg.is_tensor():
+ if not isinstance(arg, ProxyValue) or not arg.is_tensor():
return super().call_operator(op, args, kwargs, meta)
if not isinstance(arg.node.target, EdgeOpOverload):
From 8c434ddb066feafa3773ac4332a7fed62e9c6c76 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Mon, 6 Oct 2025 12:02:06 -0600
Subject: [PATCH 134/266] [Windows] Enable LLM preset in CI (#14805)
### Summary
Testing more extensions on Windows.
---
.github/workflows/build-presets.yml | 2 +-
tools/cmake/preset/windows.cmake | 9 ++-------
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml
index 66ab19eef3c..46031ac7ea3 100644
--- a/.github/workflows/build-presets.yml
+++ b/.github/workflows/build-presets.yml
@@ -109,7 +109,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- preset: [pybind, windows]
+ preset: [pybind, windows, llm]
with:
job-name: build
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/tools/cmake/preset/windows.cmake b/tools/cmake/preset/windows.cmake
index b75a5af578e..ef8bbbedbbf 100644
--- a/tools/cmake/preset/windows.cmake
+++ b/tools/cmake/preset/windows.cmake
@@ -4,14 +4,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/llm.cmake)
+
# keep sorted
set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
-set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
-set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
-set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
From 563a5d244e42787b5b94702b4766b95287257dd9 Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Mon, 6 Oct 2025 20:02:54 +0200
Subject: [PATCH 135/266] Arm backend: Remove CheckNeedsDecomposition (#14512)
Remove redundant check as this can be covered by TOSAProIntSupportList.
Signed-off-by: Oscar Andersson
---
.../tosa_profile_supported_op_lists.py | 22 +-------
.../tosa_supported_operators.py | 55 +------------------
2 files changed, 3 insertions(+), 74 deletions(-)
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index d763ef23df2..86db2d9b0b6 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -18,6 +18,7 @@
# INT profile: ops supported via native TOSA ops, decompositions/transformations, precompute, TableOps, etc.
+# Note that ops supported via pre-quantization decompositions are not included here.
TOSA_PRO_INT_SupportList: Final[Set] = {
exir_ops.edge.aten.abs.default,
exir_ops.edge.aten.add.Tensor,
@@ -46,8 +47,6 @@
exir_ops.edge.aten.hardsigmoid.default,
exir_ops.edge.aten.hardtanh.default,
exir_ops.edge.aten.hardswish.default,
- exir_ops.edge.aten.div.Tensor,
- exir_ops.edge.aten.div.Tensor_mode,
exir_ops.edge.aten.eq.Tensor,
exir_ops.edge.aten.eq.Scalar,
exir_ops.edge.aten.erf.default,
@@ -68,16 +67,7 @@
exir_ops.edge.aten.lt.Tensor,
exir_ops.edge.aten.lt.Scalar,
exir_ops.edge.aten.mul.Tensor,
- exir_ops.edge.aten.ne.Tensor,
- exir_ops.edge.aten.ne.Scalar,
exir_ops.edge.aten.neg.default,
- exir_ops.edge.aten.add.Scalar,
- exir_ops.edge.aten.sub.Scalar,
- exir_ops.edge.aten.mul.Scalar,
- exir_ops.edge.aten.div.Scalar,
- exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
- exir_ops.edge.aten.native_layer_norm.default,
- exir_ops.edge.aten.native_group_norm.default,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.mean.dim,
exir_ops.edge.aten.mm.default,
@@ -86,19 +76,12 @@
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.reciprocal.default,
exir_ops.edge.aten.relu.default,
- exir_ops.edge.aten.leaky_relu.default,
- exir_ops.edge.aten.sqrt.default,
exir_ops.edge.aten.rsqrt.default,
- exir_ops.edge.aten.round.default,
- exir_ops.edge.aten._softmax.default,
exir_ops.edge.aten.select_copy.int,
- exir_ops.edge.aten._log_softmax.default,
exir_ops.edge.aten.sub.Tensor,
exir_ops.edge.aten.tanh.default,
exir_ops.edge.aten.upsample_bilinear2d.vec,
exir_ops.edge.aten.upsample_nearest2d.vec,
- exir_ops.edge.aten.var.correction,
- exir_ops.edge.aten.var.dim,
exir_ops.edge.aten.view_copy.default,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.squeeze_copy.dims,
@@ -127,12 +110,9 @@
exir_ops.edge.aten.sign.default,
exir_ops.edge.aten.asin.default,
exir_ops.edge.aten.atanh.default,
- exir_ops.edge.aten.addmm.default,
exir_ops.edge.aten.masked_fill.Scalar,
exir_ops.edge.aten.asinh.default,
exir_ops.edge.aten.cosh.default,
- exir_ops.edge.aten.glu.default,
- exir_ops.edge.aten.logit.default,
exir_ops.edge.aten.acos.default,
exir_ops.edge.aten.elu.default,
exir_ops.edge.aten.bitwise_not.default,
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index 86c53e4aff1..f7dace09c0b 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -135,7 +135,6 @@ def tosa_support_factory(
]
if not tosa_spec.support_float():
- negative_checks.append(NeedsDecompositionCheck(reporter))
negative_checks.append(CheckProperQuantization(reporter))
if tosa_spec.is_U55_subset:
negative_checks.append(EthosU55NotSupported(reporter))
@@ -156,7 +155,8 @@ def tosa_support_factory(
class TOSAProINTSupportList(OperatorSupportBase):
"""
TOSA_PRO_INT_SupportList:
- Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps
+ Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps.
+ Note that ops supported via pre-quantization decompositions are not included here.
"""
def is_node_supported(
@@ -179,57 +179,6 @@ def is_node_supported(
return node.op == "call_function" and node.target in TOSA_PRO_FP_SupportList
-class NeedsDecompositionCheck(OperatorSupportBase):
- """
- Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding
- the operator, and to get optimal quantization parameters for each operator. This check will reject operators
- that need to be decomposed.
- """
-
- def __init__(self, reporter: WhyNoPartitionReporter):
- self.reporter = reporter
-
- def is_node_supported(
- self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
- ) -> bool:
-
- if node.op != "call_function":
- return True
-
- needs_decomp_dict = {
- exir_ops.edge.aten.div.Tensor: None,
- exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.",
- exir_ops.edge.aten.native_layer_norm.default: None,
- exir_ops.edge.aten.native_group_norm.default: None,
- exir_ops.edge.aten._softmax.default: None,
- exir_ops.edge.aten._log_softmax.default: None,
- exir_ops.edge.aten.var.correction: None,
- exir_ops.edge.aten.var.dim: None,
- exir_ops.edge.aten.add.Scalar: None,
- exir_ops.edge.aten.sqrt.default: None,
- exir_ops.edge.aten.sub.Scalar: None,
- exir_ops.edge.aten.mul.Scalar: None,
- exir_ops.edge.aten.ne.Tensor: None,
- exir_ops.edge.aten.ne.Scalar: None,
- exir_ops.edge.aten.div.Scalar: None,
- exir_ops.edge.aten.leaky_relu.default: None,
- exir_ops.edge.aten.round.default: None,
- exir_ops.edge.aten.addmm.default: None,
- exir_ops.edge.aten.glu.default: None,
- exir_ops.edge.aten.logit.default: None,
- }
-
- if node.target in needs_decomp_dict:
- reject_message = needs_decomp_dict[node.target]
- if reject_message is None:
- reject_message = "Op needs to be decomposed into other ops before quantization to get quantized properly."
-
- self.reporter.report_reject(node, reject_message)
- return False
- else:
- return True
-
-
class CheckProperQuantization(OperatorSupportBase):
"""
For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize
From 8484aeead6203f96b1033d7df5b3d51baefed3c6 Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Mon, 6 Oct 2025 20:03:51 +0200
Subject: [PATCH 136/266] Arm backend: Backend test serializes and uses
EthosUQuant on Ethos-U flows (#14817)
### Summary
Serialize and quantize automatically when possible. This makes the Ethos-U
flows work.
### Test plan
This is run by the backend test suite for Ethos-U
Signed-off-by: Zingo Andersen
---
backends/test/suite/flows/arm.py | 60 +++++++++++++++-----------------
1 file changed, 28 insertions(+), 32 deletions(-)
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
index 34a6346fb1f..85674331eda 100644
--- a/backends/test/suite/flows/arm.py
+++ b/backends/test/suite/flows/arm.py
@@ -3,70 +3,66 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+# Create flows for Arm backends used to test operator and model suites
-from executorch.backends.arm.quantizer import (
- get_symmetric_quantization_config,
- TOSAQuantizer,
-)
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.util._factory import create_quantizer
from executorch.backends.test.suite.flow import TestFlow
from executorch.backends.xnnpack.test.tester.tester import Quantize
-def _create_tosa_flow(
+def _create_arm_flow(
name,
- compile_spec,
- quantize: bool = False,
+ compile_spec: ArmCompileSpec,
symmetric_io_quantization: bool = False,
per_channel_quantization: bool = True,
) -> TestFlow:
def _create_arm_tester(*args, **kwargs) -> ArmTester:
kwargs["compile_spec"] = compile_spec
+ return ArmTester(*args, **kwargs)
+
+ support_serialize = not isinstance(compile_spec, TosaCompileSpec)
+ quantize = compile_spec.tosa_spec.support_integer()
- return ArmTester(
- *args,
- **kwargs,
- )
+ if quantize is True:
- # Create and configure quantizer to use in the flow
- def create_quantize_stage() -> Quantize:
- quantizer = TOSAQuantizer(compile_spec)
- quantization_config = get_symmetric_quantization_config(
- is_per_channel=per_channel_quantization
- )
- if symmetric_io_quantization:
- quantizer.set_io(quantization_config)
- return Quantize(quantizer, quantization_config)
+ def create_quantize_stage() -> Quantize:
+ quantizer = create_quantizer(compile_spec)
+ quantization_config = get_symmetric_quantization_config(
+ is_per_channel=per_channel_quantization
+ )
+ if symmetric_io_quantization:
+ quantizer.set_io(quantization_config)
+ return Quantize(quantizer, quantization_config)
return TestFlow(
name,
backend="arm",
tester_factory=_create_arm_tester,
- supports_serialize=False,
+ supports_serialize=support_serialize,
quantize=quantize,
- quantize_stage_factory=create_quantize_stage if quantize else None,
+ quantize_stage_factory=(create_quantize_stage if quantize is True else False),
)
-ARM_TOSA_FP_FLOW = _create_tosa_flow(
- "arm_tosa_fp", common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+ARM_TOSA_FP_FLOW = _create_arm_flow(
+ "arm_tosa_fp",
+ common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
)
-ARM_TOSA_INT_FLOW = _create_tosa_flow(
+ARM_TOSA_INT_FLOW = _create_arm_flow(
"arm_tosa_int",
common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
- quantize=True,
)
-
-ARM_ETHOS_U55_FLOW = _create_tosa_flow(
+ARM_ETHOS_U55_FLOW = _create_arm_flow(
"arm_ethos_u55",
common.get_u55_compile_spec(),
- quantize=True,
)
-
-ARM_ETHOS_U85_FLOW = _create_tosa_flow(
+ARM_ETHOS_U85_FLOW = _create_arm_flow(
"arm_ethos_u85",
common.get_u85_compile_spec(),
- quantize=True,
)
From b6bc421f2c01c38cb8a300a1cee6799151cf7818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?=
Date: Mon, 6 Oct 2025 20:05:17 +0200
Subject: [PATCH 137/266] Arm backend: Fix Arm tester issue for inplace ops
(#14625)
Deep-copying the input avoids it getting mutated by the first reference
run.
---
backends/arm/test/ops/test_silu.py | 5 -----
backends/arm/test/tester/arm_tester.py | 10 +++++++---
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py
index 25117ef89de..362358d0813 100644
--- a/backends/arm/test/ops/test_silu.py
+++ b/backends/arm/test/ops/test_silu.py
@@ -8,7 +8,6 @@
from typing import Optional, Tuple
-import pytest
import torch
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.test_pipeline import (
@@ -125,7 +124,6 @@ def test_silu_u85_INT_inplace(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_FP(test_data: input_t):
silu_data = (test_data(), False)
pipeline = VgfPipeline[input_t](
@@ -136,7 +134,6 @@ def test_silu_vgf_FP(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_FP_inplace(test_data: input_t):
silu_data = (test_data(), True)
pipeline = VgfPipeline[input_t](
@@ -147,7 +144,6 @@ def test_silu_vgf_FP_inplace(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_INT(test_data: input_t):
silu_data = (test_data(), False)
pipeline = VgfPipeline[input_t](
@@ -161,7 +157,6 @@ def test_silu_vgf_INT(test_data: input_t):
@common.parametrize("test_data", Silu.test_data)
@common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
def test_silu_vgf_INT_inplace(test_data: input_t):
silu_data = (test_data(), True)
pipeline = VgfPipeline[input_t](
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 9f530f428ce..0cba8d987c0 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -430,6 +430,10 @@ def run_method_and_compare_outputs(
for run_iteration in range(num_runs):
reference_input = inputs if inputs else next(self.generate_random_inputs())
+ # Avoid issues with inplace operators
+ test_input = copy.deepcopy(reference_input)
+ original_input = copy.deepcopy(reference_input)
+
input_shapes = [
generated_input.shape if hasattr(generated_input, "shape") else (1,)
for generated_input in reference_input
@@ -444,16 +448,16 @@ def run_method_and_compare_outputs(
# Run exported module directly
test_outputs, _ = pytree.tree_flatten(
self._calculate_reference_output(
- exported_program.module(), reference_input
+ exported_program.module(), test_input
)
)
else:
# Run lowered model with target
test_outputs, _ = pytree.tree_flatten(
- test_stage.run_artifact(reference_input)
+ test_stage.run_artifact(test_input)
)
- logger.info(f"\n Input: {reference_input}")
+ logger.info(f"\n Input: {original_input}")
logger.info(f"\n Ref output: {reference_outputs}")
logger.info(f"\nTest output: {test_outputs}")
From 6e7353f2c337afe0882ddb3579c4bdfdf6f24718 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?=
Date: Mon, 6 Oct 2025 20:06:29 +0200
Subject: [PATCH 138/266] Arm backend: Add 6D tensor and pixel
shuffle/unshuffle support (#14626)
Adds 6D tensor support required by pixel_shuffle/pixel_unshuffle when
given 4D inputs, which means for now we only support 4D inputs. Adds
TOSA, VGF and xfailing Ethos-U85 unit tests.
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218
---
.../arm/_passes/to_tosa_memory_format_pass.py | 101 +++++---
backends/arm/constants.py | 7 +-
.../tosa_supported_operators.py | 4 +-
.../arm/quantizer/quantization_annotator.py | 2 +
backends/arm/scripts/parse_test_names.py | 2 +
.../test_SD3Transformer2DModel.py | 4 -
backends/arm/test/ops/test_pixel_shuffling.py | 233 ++++++++++++++++++
backends/arm/tosa/dialect/ops/transpose.py | 4 +-
8 files changed, 310 insertions(+), 47 deletions(-)
create mode 100644 backends/arm/test/ops/test_pixel_shuffling.py
diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py
index dcbdfb03f7b..b906c06b329 100644
--- a/backends/arm/_passes/to_tosa_memory_format_pass.py
+++ b/backends/arm/_passes/to_tosa_memory_format_pass.py
@@ -26,6 +26,9 @@
NNCHW_ORDER,
NNHWC_INVERSE_ORDER,
NNHWC_ORDER,
+ NNNCHW_ORDER,
+ NNNHWC_INVERSE_ORDER,
+ NNNHWC_ORDER,
)
from executorch.exir import ExportedProgram
from executorch.exir.dialects._ops import ops as exir_ops
@@ -51,12 +54,6 @@ class ToTosaMemoryFormatPass(ExportPass):
_passes_required_after: Set[Type[ExportPass]] = set()
- NHWC_order = (0, 2, 3, 1)
- NHWC_inverse_order = (0, 3, 1, 2)
- HWCM_order = (2, 3, 0, 1)
- NNHWC_order = (0, 1, 3, 4, 2)
- NNHWC_inverse_order = (0, 1, 4, 2, 3)
-
def __init__(self, exported_program: ExportedProgram) -> None:
self.exported_program = exported_program
super().__init__()
@@ -93,7 +90,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
@staticmethod
def memory_format_differs(shape):
"""Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format"""
- if len(shape) >= 5:
+ if len(shape) >= 6:
+ C = shape[3]
+ H = shape[4]
+ W = shape[5]
+ elif len(shape) == 5:
C = shape[2]
H = shape[3]
W = shape[4]
@@ -112,25 +113,26 @@ def memory_format_differs(shape):
@staticmethod
def is_channel_reshape(input_shape, output_shape):
- """Returns true if the reshape changes the channel dimension"""
- if not (
- (len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5)))
- or (len(input_shape) == 4 and len(output_shape) == 5)
- or (len(input_shape) == 5 and len(output_shape) == 4)
- ):
+ """Returns true if reshape changes the channel dimension or batch product dimension(s)"""
+
+ valid_ranks = {4, 5, 6}
+
+ if not (len(input_shape) in valid_ranks and len(output_shape) in valid_ranks):
return False
C_old = input_shape[-3]
C_new = output_shape[-3]
- N_new = (
- output_shape[0]
- if len(output_shape) == 4
- else output_shape[0] * output_shape[1]
- )
- N_old = (
- input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1]
- )
+ def get_batch_prod_dim(shape):
+ product = 1
+
+ for dim in shape[:-3]:
+ product = product * dim
+
+ return product
+
+ N_old = get_batch_prod_dim(input_shape)
+ N_new = get_batch_prod_dim(output_shape)
return (N_old != N_new) or (C_old != C_new)
@@ -141,17 +143,27 @@ def insert_input_transpose(node, input_node, graph_module):
node.replace_input_with(input_node, pre_permute_node)
return
+ if len(get_first_fake_tensor(input_node).size()) == 6:
+ mem_format = NNNHWC_INVERSE_ORDER
+ elif len(get_first_fake_tensor(input_node).size()) == 5:
+ mem_format = NNHWC_INVERSE_ORDER
+ else:
+ mem_format = NHWC_INVERSE_ORDER
+ # Guard: mem_format must be a true permutation for the current rank
+ _rank_ = len(
+ get_first_fake_tensor(input_node).size()
+ ) # or (node) in output path
+ assert sorted(mem_format) == list(
+ range(_rank_)
+ ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose"
+
with graph_module.graph.inserting_before(node):
permute_node = create_node(
graph_module.graph,
exir_ops.backend.tosa.TRANSPOSE.default,
args=(
input_node,
- list(
- NNHWC_INVERSE_ORDER
- if len(get_first_fake_tensor(input_node).size()) == 5
- else NHWC_INVERSE_ORDER
- ),
+ list(mem_format),
),
from_node=node,
)
@@ -163,26 +175,38 @@ def insert_input_transpose(node, input_node, graph_module):
@staticmethod
def insert_output_transpose(node, graph_module):
+
+ if len(get_first_fake_tensor(node).size()) == 6:
+ mem_format = NNNHWC_ORDER
+ elif len(get_first_fake_tensor(node).size()) == 5:
+ mem_format = NNHWC_ORDER
+ else:
+ mem_format = NHWC_ORDER
+ # Guard: mem_format must be a true permutation for the current rank
+ _rank_ = len(get_first_fake_tensor(node).size()) # or (node) in output path
+ assert sorted(mem_format) == list(
+ range(_rank_)
+ ), f"bad perm {mem_format} for rank {_rank_} in insert_output_transpose"
+
with graph_module.graph.inserting_after(node):
permute_node = create_node(
graph_module.graph,
exir_ops.backend.tosa.TRANSPOSE.default,
args=(
node,
- list(
- NNHWC_ORDER
- if len(get_first_fake_tensor(node).size()) == 5
- else NHWC_ORDER
- ),
+ list(mem_format),
),
from_node=node,
)
- permute_node.meta["tosa_dim_order"] = (
- NNHWC_ORDER
- if len(get_first_fake_tensor(node).size()) == 5
- else NHWC_ORDER
- )
+ rank = len(get_first_fake_tensor(node).size())
+ if rank == 6:
+ permute_node.meta["tosa_dim_order"] = NNNHWC_ORDER
+ elif rank == 5:
+ permute_node.meta["tosa_dim_order"] = NNHWC_ORDER
+ else:
+ permute_node.meta["tosa_dim_order"] = NHWC_ORDER
+
node.meta["tosa_dim_order"] = tuple(
range(len(get_first_fake_tensor(node).size()))
)
@@ -261,7 +285,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
]
for input_node in inputs:
input_dim_order = get_first_fake_tensor(input_node).dim_order()
- if input_dim_order in (NCHW_ORDER, NNCHW_ORDER):
+ if input_dim_order in (NCHW_ORDER, NNCHW_ORDER, NNNCHW_ORDER):
self.insert_output_transpose(input_node, graph_module)
# Transpose outputs if they are in (N)NCHW format
@@ -276,6 +300,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
if output_dim_order in (
NCHW_ORDER,
NNCHW_ORDER,
+ NNNCHW_ORDER,
):
self.insert_input_transpose(
output_node, output_node_input, graph_module
@@ -313,6 +338,8 @@ def call(self, graph_module: torch.fx.GraphModule):
dim_order = HWCM_ORDER
elif node_data.dim() == 5:
dim_order = NNHWC_ORDER
+ elif node_data.dim() == 6:
+ dim_order = NNNHWC_ORDER
else:
dim_order = tuple(range(node_data.dim())) # type: ignore[assignment]
diff --git a/backends/arm/constants.py b/backends/arm/constants.py
index b9995410b23..0e562f12e88 100644
--- a/backends/arm/constants.py
+++ b/backends/arm/constants.py
@@ -34,10 +34,13 @@
NHWC_INVERSE_ORDER: Final = (0, 3, 1, 2)
NNHWC_ORDER: Final = (0, 1, 3, 4, 2)
NNHWC_INVERSE_ORDER: Final = (0, 1, 4, 2, 3)
+NNNHWC_ORDER: Final = (0, 1, 2, 4, 5, 3)
+NNNHWC_INVERSE_ORDER: Final = (0, 1, 2, 5, 3, 4)
NCHW_ORDER: Final = (0, 1, 2, 3)
-NCHW_INVERSE_ORDER: Final = (0, 2, 3, 1)
NNCHW_ORDER: Final = (0, 1, 2, 3, 4)
-NNCHW_INVERSE_ORDER: Final = (0, 1, 3, 4, 2)
+NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5)
HWCM_ORDER: Final = (2, 3, 0, 1)
+
+MAX_RANK: Final = 6
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index f7dace09c0b..f7857894d40 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -19,7 +19,7 @@
FuseQuantizedActivationPass,
)
from executorch.backends.arm._passes.insert_table_ops import TableOps
-from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.backends.arm.constants import DQ_OPS, MAX_RANK, Q_OPS
from executorch.backends.arm.operator_support.ethos_u55_support import (
EthosU55CastCheck,
EthosU55DtypeSupport,
@@ -127,7 +127,7 @@ def tosa_support_factory(
negative_checks: list[OperatorSupportBase] = [
CheckInt64InputsAndOutputs(exported_program, reporter),
CheckFloat64Inputs(exported_program, reporter),
- RankCheck(reporter, max_rank=5),
+ RankCheck(reporter, max_rank=MAX_RANK),
*[
reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
for check in (additional_checks if additional_checks else [])
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index ebc91c22bbb..349aa3e6b21 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -370,6 +370,8 @@ def _match_pattern(
torch.ops.aten.dropout_.default,
torch.ops.aten.adaptive_avg_pool2d.default,
torch.ops.aten.alias_copy.default,
+ torch.ops.aten.pixel_shuffle.default,
+ torch.ops.aten.pixel_unshuffle.default,
]
diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py
index 2629d8eb257..54f8aa7421d 100644
--- a/backends/arm/scripts/parse_test_names.py
+++ b/backends/arm/scripts/parse_test_names.py
@@ -26,6 +26,8 @@
"_native_batch_norm_legit_no_training.default",
"_native_batch_norm_legit.no_stats",
"alias_copy.default",
+ "pixel_shuffle.default",
+ "pixel_unshuffle.default",
]
ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index 1267c5b8e4c..9506fe727db 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -30,16 +30,12 @@ class TestSD3Transformer2DModel:
# Adjust nbr below as we increase op support.
ops_after_partitioner_FP = {
- "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
"executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1,
- "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
"executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1,
"torch.ops.higher_order.executorch_call_delegate": 1,
}
ops_after_partitioner_INT = {
- "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
- "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
"executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
"torch.ops.higher_order.executorch_call_delegate": 2,
}
diff --git a/backends/arm/test/ops/test_pixel_shuffling.py b/backends/arm/test/ops/test_pixel_shuffling.py
new file mode 100644
index 00000000000..5aeb8b2d1bb
--- /dev/null
+++ b/backends/arm/test/ops/test_pixel_shuffling.py
@@ -0,0 +1,233 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Tuple
+
+import pytest
+
+import torch
+
+from executorch.backends.arm.constants import MAX_RANK
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineINT,
+ EthosU85PipelineINT,
+ TosaPipelineFP,
+ TosaPipelineINT,
+ VgfPipeline,
+)
+from torch import nn
+
+aten_op_pixel_unshuffle = "torch.ops.aten.pixel_unshuffle.default"
+exir_op_pixel_unshuffle = (
+ "executorch_exir_dialects_edge__ops_aten_pixel_unshuffle_default"
+)
+
+aten_op_pixel_shuffle = "torch.ops.aten.pixel_shuffle.default"
+exir_op_pixel_shuffle = "executorch_exir_dialects_edge__ops_aten_pixel_shuffle_default"
+
+input_t1 = Tuple[torch.Tensor] # single positional input (1-tuple)
+
+max_rank_input_supported = MAX_RANK - 2
+
+
+class PixelUnShuffle(nn.Module):
+
+ upscale_factor = 2
+ test_data_generators = {
+ "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+ "test_4d": lambda: (torch.tensor([[[[10.0, 20.0], [30.0, 40.0]]]]),),
+ "test_3d": lambda: (torch.tensor([[[10.0, 20.0], [30.0, 40.0]]]),),
+ }
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.space_to_depth = nn.PixelUnshuffle(self.upscale_factor)
+
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ if inputs.dim() > max_rank_input_supported:
+ raise RuntimeError(
+ f"Max rank of input for pixel_unshuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+ )
+ return self.space_to_depth(inputs)
+
+
+class PixelShuffle(nn.Module):
+
+ upscale_factor = 2
+ test_data_generators = {
+ "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+ "test_4d": lambda: (torch.tensor([[[[10.0]], [[20.0]], [[30.0]], [[40.0]]]]),),
+ "test_3d": lambda: (torch.tensor([[[10.0]], [[20.0]], [[30.0]], [[40.0]]]),),
+ }
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.depth_to_space = nn.PixelShuffle(self.upscale_factor)
+
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ if inputs.dim() > max_rank_input_supported:
+ raise RuntimeError(
+ f"Max rank of input for pixel_shuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+ )
+ return self.depth_to_space(inputs)
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_FP(test_data: input_t1):
+ pipeline = TosaPipelineFP[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_INT(test_data: input_t1):
+ pipeline = TosaPipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_FP(test_data: input_t1):
+ pipeline = TosaPipelineFP[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_INT(test_data: input_t1):
+ pipeline = TosaPipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_FP(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ tosa_version="TOSA-1.0+FP",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_INT(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ tosa_version="TOSA-1.0+INT",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_FP(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ tosa_version="TOSA-1.0+FP",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_INT(test_data: input_t1):
+ pipeline = VgfPipeline[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ tosa_version="TOSA-1.0+INT",
+ run_on_vulkan_runtime=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_unshuffle_u55_INT(test_data: input_t1):
+ pipeline = EthosU55PipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_unshuffle_u85_INT(test_data: input_t1):
+ pipeline = EthosU85PipelineINT[input_t1](
+ PixelUnShuffle(),
+ test_data(),
+ aten_op_pixel_unshuffle,
+ exir_op_pixel_unshuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_shuffle_u55_INT(test_data: input_t1):
+ pipeline = EthosU55PipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_shuffle_u85_INT(test_data: input_t1):
+ pipeline = EthosU85PipelineINT[input_t1](
+ PixelShuffle(),
+ test_data(),
+ aten_op_pixel_shuffle,
+ exir_op_pixel_shuffle,
+ run_on_fvp=True,
+ )
+ pipeline.run()
diff --git a/backends/arm/tosa/dialect/ops/transpose.py b/backends/arm/tosa/dialect/ops/transpose.py
index 9c5aba05394..8d5bf8bac70 100644
--- a/backends/arm/tosa/dialect/ops/transpose.py
+++ b/backends/arm/tosa/dialect/ops/transpose.py
@@ -26,9 +26,9 @@ def TRANSPOSE(a, perms):
# By utilizing an edge IR passthrough operator we can keep the edge program in
# channels-first/contiguous and get the desired behavior in the TOSA lowering.
- if len(perms) not in (4, 5):
+ if len(perms) not in (4, 5, 6):
raise TosaValueError(
- f"Only 4D and 5D tensors are supported, got {len(perms)}: {perms}",
+ f"Only 4D, 5D and 6D tensors are supported, got {len(perms)}: {perms}",
op="TRANSPOSE",
)
From 266cfd03c0814653d0fb4664b87ca3d2705d3a0e Mon Sep 17 00:00:00 2001
From: per held
Date: Mon, 6 Oct 2025 20:09:14 +0200
Subject: [PATCH 139/266] Arm backend: Add test for monitoring memory
allocation (#14657)
Simple test to monitor memory allocations when running the "add" model
in fvp.
Signed-off-by: per.held@arm.com
---
.github/workflows/trunk.yml | 1 +
backends/arm/test/test_arm_baremetal.sh | 15 ++
.../arm/test/test_memory_allocator_log.py | 170 ++++++++++++++++++
3 files changed, 186 insertions(+)
create mode 100644 backends/arm/test/test_memory_allocator_log.py
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index adf3b7da151..aabea88f517 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -289,6 +289,7 @@ jobs:
- test_arm_baremetal: test_models_ethos-u55
- test_arm_baremetal: test_models_ethos-u85
- test_arm_baremetal: test_smaller_stories_llama
+ - test_arm_baremetal: test_memory_allocation
fail-fast: false
with:
runner: linux.2xlarge.memory
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index be87ea629d8..b8e8aee4e3a 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -366,5 +366,20 @@ test_smaller_stories_llama() {
echo "${TEST_SUITE_NAME}: PASS"
}
+test_memory_allocation() {
+ echo "${TEST_SUITE_NAME}: Test ethos-u memory allocation with run.sh"
+
+ mkdir -p arm_test/test_run
+ # Ethos-U85
+ echo "${TEST_SUITE_NAME}: Test target Ethos-U85"
+ examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=examples/arm/example_modules/add.py &> arm_test/test_run/full.log
+ python3 backends/arm/test/test_memory_allocator_log.py --log arm_test/test_run/full.log \
+ --require "model_pte_program_size" "<= 3000 B" \
+ --require "method_allocator_planned" "<= 64 B" \
+ --require "method_allocator_loaded" "<= 1024 B" \
+ --require "method_allocator_input" "<= 4 B" \
+ --require "Total DRAM used" "<= 0.06 KiB"
+ echo "${TEST_SUITE_NAME}: PASS"
+}
${TEST_SUITE}
diff --git a/backends/arm/test/test_memory_allocator_log.py b/backends/arm/test/test_memory_allocator_log.py
new file mode 100644
index 00000000000..3853b60b7f6
--- /dev/null
+++ b/backends/arm/test/test_memory_allocator_log.py
@@ -0,0 +1,170 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Check log files for memory metrics and compare them against thresholds.
+
+Usage example:
+ python3 test_memory_allocator_log.py \
+ --log path/to/log.txt \
+ --require "Total SRAM used" "<= 310 KiB" \
+ --require "method_allocator_input" "<= 4 B"
+"""
+
+import argparse
+import re
+import sys
+from typing import List, Optional, Tuple
+
+
+def unit_factor(u: str) -> float:
+ if not u:
+ return 1.0
+ ul = u.strip().lower()
+ table = {
+ "b": 1,
+ "byte": 1,
+ "bytes": 1,
+ "kb": 1000,
+ "mb": 1000**2,
+ "gb": 1000**3,
+ "kib": 1024,
+ "mib": 1024**2,
+ "gib": 1024**3,
+ }
+ if ul in table:
+ return float(table[ul])
+ return 1.0
+
+
+def parse_value(text_num: str, text_unit: Optional[str]) -> float:
+ return float(text_num) * unit_factor(text_unit or "")
+
+
+def parse_cond(cond: str) -> Tuple[str, float, str]:
+ # Regexp explained. Example of things it will parse:
+ # "< 310 KiB", ">=10MB", "== 42", "!=3 bytes", "<=0.5 MiB"
+
+ # The regexp explained in detail:
+ # ^: anchor the match to the start and end of the string (no extra chars allowed).
+ # \s*: optional whitespace (spaces, tabs, etc.).
+ # (<=|>=|==|!=|<|>): capturing group 1. One of the comparison operators: <=, >=, ==, !=, <, >.
+ # \s*: optional whitespace.
+ # ([0-9]+(?:\.[0-9]+)?): capturing group 2. A number:
+ # [0-9]+: one or more digits (the integer part).
+ # (?:\.[0-9]+)?: optional non-capturing group for a fractional part like .25.
+ # \s*: optional whitespace between number and unit
+    # ([A-Za-z]+)?: capturing group 3, optional. A unit made of letters only (e.g., B, KB, KiB, MB, MiB). Case-insensitive by class choice.
+ # \s*: optional trailing whitespace.
+ m = re.match(
+ r"^\s*(<=|>=|==|!=|<|>)\s*([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?\s*$", cond
+ )
+ if not m:
+ raise ValueError(f"Invalid condition: {cond}")
+ op, num, unit = m.groups()
+ return op, float(num), (unit or "")
+
+
+def compare(a: float, b: float, op: str) -> bool:
+ return {
+ "<": a < b,
+ "<=": a <= b,
+ ">": a > b,
+ ">=": a >= b,
+ "==": abs(a - b) < 1e-9,
+ "!=": abs(a - b) >= 1e-9,
+ }[op]
+
+
+def find_metric_value(line: str, label: str) -> Tuple[Optional[str], Optional[str]]:
+ # Same regexp as parse_cond() but without the first group of matching comparison operators
+ # First go, search for the pattern but escape and ignore cases
+ # The regexp:
+ # ([0-9]+(?:\.[0-9]+)?) — capturing group 1: a decimal number
+ # [0-9]+ — one or more digits (integer part)
+ # (?:\.[0-9]+)? — optional fractional part like .25 (non-capturing)
+ # \s* — optional whitespace between number and unit
+ # ([A-Za-z]+)? — capturing group 2 (optional): a unit made only of letters (e.g., B, KB, KiB, MB)
+ m = re.search(
+ re.escape(label) + r".*?([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?",
+ line,
+ flags=re.IGNORECASE,
+ )
+ if m:
+ return m.group(1), m.group(2)
+ # Second go, same regexp as above but not caring about label. If
+ # no number was tied to a label be happy just salvaging it from
+ # the line
+ m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", line)
+ if m:
+ return m.group(1), m.group(2)
+ return None, None
+
+
+def first_line_with_label(lines: List[str], label: str) -> Optional[str]:
+ label_lc = label.lower()
+ return next((ln for ln in lines if label_lc in ln.lower()), None)
+
+
+def check_requirement(label: str, cond: str, lines: List[str]) -> Optional[str]:
+ op, thr_num, thr_unit = parse_cond(cond)
+ matched = first_line_with_label(lines, label)
+ if matched is None:
+ return f"{label}: not found in log"
+
+ num_str, unit_str = find_metric_value(matched, label)
+ if num_str is None:
+ return f"{label}: value not found on line: {matched.strip()}"
+
+ left_bytes = parse_value(num_str, unit_str)
+ right_bytes = parse_value(str(thr_num), thr_unit or (unit_str or ""))
+ ok = compare(left_bytes, right_bytes, op)
+
+ human_left = f"{num_str} {unit_str or 'B'}"
+ human_right = f"{thr_num:g} {thr_unit or (unit_str or 'B')}"
+ print(
+ f"[check] {label}: {human_left} {op} {human_right} -> {'OK' if ok else 'FAIL'}"
+ )
+
+ if ok:
+ return None
+ return f"{label}: {human_left} not {op} {human_right}"
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--log", required=True, help="Path to log file")
+ parser.add_argument(
+ "--require",
+ action="append",
+ nargs=2,
+ metavar=("LABEL", "COND"),
+ default=[],
+ help="""Required label and condition consisting
+ of a number and unit. Example: \"Total DRAM
+ used\" \"<= 0.06 KiB\"""",
+ )
+ args = parser.parse_args()
+
+ with open(args.log, "r", encoding="utf-8", errors="ignore") as f:
+ lines = f.readlines()
+
+ failures: List[str] = []
+ for label, cond in args.require:
+ msg = check_requirement(label, cond, lines)
+ if msg:
+ failures.append(msg)
+
+ if failures:
+ print("Failures:")
+ for msg in failures:
+ print(" - " + msg)
+ return 1
+
+ print("All checks passed.")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
From f174974eb72df4c74cc863da05d930444e60fa6a Mon Sep 17 00:00:00 2001
From: per held
Date: Mon, 6 Oct 2025 20:10:37 +0200
Subject: [PATCH 140/266] Arm backend: Remove hello_world in core_software
(#14775)
---
...Remove-hello_world-from-applications.patch | 25 +++++++++++++++++++
1 file changed, 25 insertions(+)
create mode 100644 examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
diff --git a/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
new file mode 100644
index 00000000000..11590a8578f
--- /dev/null
+++ b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
@@ -0,0 +1,25 @@
+From f6a7d867212336b3e344c21240a2a03671bffd65 Mon Sep 17 00:00:00 2001
+From: Per Held
+Date: Wed, 17 Sep 2025 13:46:05 +0200
+Subject: Remove hello_world from applications
+
+---
+ applications/CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt
+index a017575..130f0f7 100644
+--- a/applications/CMakeLists.txt
++++ b/applications/CMakeLists.txt
+@@ -21,7 +21,7 @@ add_subdirectory(driver_unit_tests)
+
+ add_subdirectory(freertos)
+
+-add_subdirectory(hello_world)
++#add_subdirectory(hello_world)
+
+ add_subdirectory(threadx_demo)
+
+--
+2.43.0
+
From cf314751807e5b37a87d9f01877be4013b9c021a Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Mon, 6 Oct 2025 14:16:32 -0600
Subject: [PATCH 141/266] Revert "[Windows] Enable LLM preset in CI (#14805)"
(#14823)
This reverts commit 8c434ddb066feafa3773ac4332a7fed62e9c6c76.
Disabling for now as the Windows unittest jobs are failing post-merge.
They were clean on the PR, so probably just a conflict with a recent
change. I will investigate and re-merge.
---
.github/workflows/build-presets.yml | 2 +-
tools/cmake/preset/windows.cmake | 9 +++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml
index 46031ac7ea3..66ab19eef3c 100644
--- a/.github/workflows/build-presets.yml
+++ b/.github/workflows/build-presets.yml
@@ -109,7 +109,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- preset: [pybind, windows, llm]
+ preset: [pybind, windows]
with:
job-name: build
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/tools/cmake/preset/windows.cmake b/tools/cmake/preset/windows.cmake
index ef8bbbedbbf..b75a5af578e 100644
--- a/tools/cmake/preset/windows.cmake
+++ b/tools/cmake/preset/windows.cmake
@@ -4,9 +4,14 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/llm.cmake)
-
# keep sorted
set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
+set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON)
From a39866ca8c9a0a497f6682eb80e07ac99dbb96ba Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Mon, 6 Oct 2025 13:31:56 -0700
Subject: [PATCH 142/266] Fix op signature for avg_pool2d
Differential Revision: D83873533
Pull Request resolved: https://github.com/pytorch/executorch/pull/14787
---
backends/cadence/aot/ops_registrations.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index e3009163d62..f7d07018e59 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -329,7 +329,7 @@
"Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor out)"
)
lib.define(
- "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, "
+ "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], bool ceil_mode=False, "
"bool count_include_pad=True, int? divisor_override=None, Tensor? in_zero_point=None, bool channel_last=False) -> (Tensor out)"
)
lib.define(
@@ -525,7 +525,7 @@
"Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
- "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, "
+ "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], "
"bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, "
"Tensor? in_zero_point=None, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
)
From bc931e17135e38554e4752b2b3324b9754f29139 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 15:35:52 -0700
Subject: [PATCH 143/266] Update APP_PATH to point to mv3 directory (#14828)
---
scripts/test_ios.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh
index 8cb86f8f43c..599ae1683a4 100755
--- a/scripts/test_ios.sh
+++ b/scripts/test_ios.sh
@@ -15,7 +15,7 @@ set -e
OUTPUT="${1:-executorch}"
EXIT_STATUS=0
-APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo"
+APP_PATH="executorch-examples/mv3/apple/ExecuTorchDemo/ExecuTorchDemo"
MODEL_NAME="mv3"
SIMULATOR_NAME="executorch"
From 270873fa4fbab639820bb4375bd47ef2d2cd2fde Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada
Date: Mon, 6 Oct 2025 15:38:40 -0700
Subject: [PATCH 144/266] Restructure ET documentation with 'Platform First'
model (#14720)
### Summary
Restructure the ExecuTorch documentation into a 'Platform First' model:
content is reorganized by target platform (Android, iOS, desktop,
embedded) with new per-section landing pages (e.g. android-section,
ios-section, advanced-topics-section), and the legacy top-level pages
(api.md, backends.md, intro.md) are replaced by section index pages.
### Test plan
Docs-only change; verified locally that the restructured pages build
and that toctree navigation links resolve.
---
docs/source/advanced-topics-section.md | 112 +++++++
docs/source/android-arm-vgf.md | 1 +
docs/source/android-backends.md | 28 ++
docs/source/android-examples.md | 9 +
docs/source/android-mediatek.md | 1 +
docs/source/android-qualcomm.md | 1 +
docs/source/android-samsung-exynos.md | 1 +
docs/source/android-section.md | 23 ++
docs/source/android-vulkan.md | 1 +
docs/source/android-xnnpack.md | 1 +
docs/source/api-section.md | 26 ++
docs/source/api.md | 11 -
docs/source/backend-delegate-advanced.md | 33 ++
docs/source/backends-overview.md | 73 ++++-
docs/source/backends-samsung-exynos.md | 1 +
docs/source/backends-section.md | 1 +
docs/source/backends-xnnpack.md | 7 +-
docs/source/backends.md | 17 -
.../compiler-delegate-and-partitioner.md | 2 +-
docs/source/compiler-ir-advanced.md | 31 ++
docs/source/desktop-backends.md | 27 ++
docs/source/desktop-coreml.md | 1 +
docs/source/desktop-mps.md | 1 +
docs/source/desktop-openvino.md | 1 +
docs/source/desktop-section.md | 19 ++
docs/source/desktop-xnnpack.md | 1 +
docs/source/edge-platforms-section.md | 73 +++++
docs/source/embedded-arm-ethos-u.md | 1 +
docs/source/embedded-backends.md | 20 ++
docs/source/embedded-cadence.md | 1 +
docs/source/embedded-nxp.md | 1 +
docs/source/embedded-section.md | 39 +++
docs/source/file-formats-advanced.md | 17 +
docs/source/index.md | 307 +++++++++++-------
docs/source/intro-section.md | 27 ++
docs/source/intro.md | 10 -
docs/source/ios-backends.md | 19 ++
docs/source/ios-coreml.md | 1 +
docs/source/ios-examples.md | 4 +
docs/source/ios-mps.md | 1 +
docs/source/ios-section.md | 23 ++
docs/source/ios-xnnpack.md | 1 +
docs/source/kernel-library-advanced.md | 23 ++
docs/source/kernel-library-overview.md | 4 +-
...lama3-qualcomm-ai-engine-direct-backend.md | 5 +-
docs/source/llm/working-with-llms.md | 9 +-
docs/source/platforms-desktop.md | 23 ++
docs/source/platforms-embedded.md | 19 ++
docs/source/quantization-optimization.md | 20 ++
docs/source/quick-start-section.md | 38 +++
docs/source/runtime-integration-advanced.md | 20 ++
docs/source/success-stories.md | 56 ++++
docs/source/support-section.md | 17 +
docs/source/tools-section.md | 30 ++
docs/source/using-executorch-export.md | 2 +-
55 files changed, 1054 insertions(+), 187 deletions(-)
create mode 100644 docs/source/advanced-topics-section.md
create mode 100644 docs/source/android-arm-vgf.md
create mode 100644 docs/source/android-backends.md
create mode 100644 docs/source/android-examples.md
create mode 100644 docs/source/android-mediatek.md
create mode 100644 docs/source/android-qualcomm.md
create mode 100644 docs/source/android-samsung-exynos.md
create mode 100644 docs/source/android-section.md
create mode 100644 docs/source/android-vulkan.md
create mode 100644 docs/source/android-xnnpack.md
create mode 100644 docs/source/api-section.md
delete mode 100644 docs/source/api.md
create mode 100644 docs/source/backend-delegate-advanced.md
create mode 100644 docs/source/backends-samsung-exynos.md
create mode 100644 docs/source/backends-section.md
delete mode 100644 docs/source/backends.md
create mode 100644 docs/source/compiler-ir-advanced.md
create mode 100644 docs/source/desktop-backends.md
create mode 100644 docs/source/desktop-coreml.md
create mode 100644 docs/source/desktop-mps.md
create mode 100644 docs/source/desktop-openvino.md
create mode 100644 docs/source/desktop-section.md
create mode 100644 docs/source/desktop-xnnpack.md
create mode 100644 docs/source/edge-platforms-section.md
create mode 100644 docs/source/embedded-arm-ethos-u.md
create mode 100644 docs/source/embedded-backends.md
create mode 100644 docs/source/embedded-cadence.md
create mode 100644 docs/source/embedded-nxp.md
create mode 100644 docs/source/embedded-section.md
create mode 100644 docs/source/file-formats-advanced.md
create mode 100644 docs/source/intro-section.md
delete mode 100644 docs/source/intro.md
create mode 100644 docs/source/ios-backends.md
create mode 100644 docs/source/ios-coreml.md
create mode 100644 docs/source/ios-examples.md
create mode 100644 docs/source/ios-mps.md
create mode 100644 docs/source/ios-section.md
create mode 100644 docs/source/ios-xnnpack.md
create mode 100644 docs/source/kernel-library-advanced.md
create mode 100644 docs/source/platforms-desktop.md
create mode 100644 docs/source/platforms-embedded.md
create mode 100644 docs/source/quantization-optimization.md
create mode 100644 docs/source/quick-start-section.md
create mode 100644 docs/source/runtime-integration-advanced.md
create mode 100644 docs/source/success-stories.md
create mode 100644 docs/source/support-section.md
create mode 100644 docs/source/tools-section.md
diff --git a/docs/source/advanced-topics-section.md b/docs/source/advanced-topics-section.md
new file mode 100644
index 00000000000..e7b7f5490c6
--- /dev/null
+++ b/docs/source/advanced-topics-section.md
@@ -0,0 +1,112 @@
+(advanced-topics-section)=
+
+# Advanced
+
+Deep dive into ExecuTorch's advanced features for optimization, customization, and integration.
+
+This section covers advanced concepts for developers who need to customize ExecuTorch for specific use cases, optimize performance, or integrate with custom hardware backends.
+
+## Quantization & Optimization
+
+Techniques for model compression and performance optimization.
+
+**→ {doc}`quantization-optimization` — Quantization strategies and performance optimization**
+
+Key topics:
+
+- Quantization strategies and techniques
+- Performance profiling and optimization
+
+## Model Export
+
+Learn the core ExecuTorch workflow, exporting PyTorch models to the `.pte` format for edge deployment.
+
+**→ {doc}`using-executorch-export`** - Model Export & Lowering
+
+Key topics:
+
+- Export and Lowering Workflow
+- Hardware Backend Selection & Optimization
+- Dynamic Shapes & Advanced Model Features
+
+
+## Kernel Library
+
+Deep dive into ExecuTorch's kernel implementation and customization.
+
+**→ {doc}`kernel-library-advanced` — Kernel library deep dive and customization**
+
+Key topics:
+
+- Kernel library architecture
+- Custom kernel implementation
+- Selective build and optimization
+
+## Backend & Delegates
+
+**→ {doc}`backend-delegate-advanced` — Backend delegate integration**
+
+Key topics:
+
+- Learn how to integrate Backend Delegate into ExecuTorch and more
+- XNNPACK Delegate Internals
+- Debugging Delegation
+
+
+## Runtime & Integration
+
+Advanced runtime features and backend integration.
+
+**→ {doc}`runtime-integration-advanced` — Runtime customization and backend integration**
+
+Key topics:
+
+- Backend delegate implementation
+- Platform abstraction layer
+- Custom runtime integration
+
+## Compiler & IR
+
+Advanced compiler features and intermediate representation details.
+
+**→ {doc}`compiler-ir-advanced` — Compiler passes and IR specification**
+
+Key topics:
+
+- Custom compiler passes
+- Memory planning strategies
+- Backend dialect and EXIR
+- Ops set definition
+
+
+## File Formats
+
+ExecuTorch file format specifications and internals.
+
+**→ {doc}`file-formats-advanced` — PTE and PTD file format specifications**
+
+Key topics:
+
+- PTE file format internals
+- PTD file format specification
+- Custom file format handling
+
+## Next Steps
+
+After exploring advanced topics:
+
+- **{doc}`tools-sdk-section`** - Developer tools for debugging and profiling
+- **{doc}`api-section`** - Complete API reference documentation
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Advanced Topics
+
+quantization-optimization
+using-executorch-export
+kernel-library-advanced
+backend-delegate-advanced
+runtime-integration-advanced
+compiler-ir-advanced
+file-formats-advanced
diff --git a/docs/source/android-arm-vgf.md b/docs/source/android-arm-vgf.md
new file mode 100644
index 00000000000..cc39b53e176
--- /dev/null
+++ b/docs/source/android-arm-vgf.md
@@ -0,0 +1 @@
+```{include} backends-arm-vgf.md
diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md
new file mode 100644
index 00000000000..d506813990b
--- /dev/null
+++ b/docs/source/android-backends.md
@@ -0,0 +1,28 @@
+(android-backends)=
+# Backends
+
+Available hardware acceleration backends for Android deployment.
+
+## CPU Acceleration
+
+- {doc}`android-xnnpack` — XNNPACK CPU acceleration
+
+## GPU Acceleration
+
+- {doc}`android-vulkan` — Vulkan GPU acceleration
+
+## NPU/Accelerator Backends
+
+- {doc}`android-qualcomm` — Qualcomm AI Engine (NPU)
+- {doc}`android-mediatek` — MediaTek NPU acceleration
+- {doc}`android-arm-vgf` — ARM VGF Backend
+- {doc}`android-samsung-exynos` — Samsung Exynos NPU
+
+```{toctree}
+:hidden:
+android-xnnpack
+android-vulkan
+android-qualcomm
+android-mediatek
+android-arm-vgf
+android-samsung-exynos
diff --git a/docs/source/android-examples.md b/docs/source/android-examples.md
new file mode 100644
index 00000000000..65580870c57
--- /dev/null
+++ b/docs/source/android-examples.md
@@ -0,0 +1,9 @@
+# Examples & Demos
+
+- [Working with LLMs - Android Examples](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
+- [Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
+- {doc}`tutorial-arm-vgf` — Export a simple PyTorch model for the ExecuTorch VGF backend
+
+```{toctree}
+:hidden:
+tutorial-arm-vgf
diff --git a/docs/source/android-mediatek.md b/docs/source/android-mediatek.md
new file mode 100644
index 00000000000..7034fe439dd
--- /dev/null
+++ b/docs/source/android-mediatek.md
@@ -0,0 +1 @@
+```{include} backends-mediatek.md
diff --git a/docs/source/android-qualcomm.md b/docs/source/android-qualcomm.md
new file mode 100644
index 00000000000..f484d771a8b
--- /dev/null
+++ b/docs/source/android-qualcomm.md
@@ -0,0 +1 @@
+```{include} backends-qualcomm.md
diff --git a/docs/source/android-samsung-exynos.md b/docs/source/android-samsung-exynos.md
new file mode 100644
index 00000000000..4c5a470edca
--- /dev/null
+++ b/docs/source/android-samsung-exynos.md
@@ -0,0 +1 @@
+```{include} backends-samsung-exynos.md
diff --git a/docs/source/android-section.md b/docs/source/android-section.md
new file mode 100644
index 00000000000..a5774352bc1
--- /dev/null
+++ b/docs/source/android-section.md
@@ -0,0 +1,23 @@
+(android-section)=
+
+# Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-android` — Complete Android integration guide
+
+## Backends
+
+- {doc}`android-backends` — Available Android backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`android-examples` — Explore Android Examples & Demos
+
+```{toctree}
+:hidden:
+using-executorch-android
+android-backends
+android-examples
diff --git a/docs/source/android-vulkan.md b/docs/source/android-vulkan.md
new file mode 100644
index 00000000000..6399ac4ec7c
--- /dev/null
+++ b/docs/source/android-vulkan.md
@@ -0,0 +1 @@
+```{include} backends-vulkan.md
diff --git a/docs/source/android-xnnpack.md b/docs/source/android-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/android-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/api-section.md b/docs/source/api-section.md
new file mode 100644
index 00000000000..f5725a063d4
--- /dev/null
+++ b/docs/source/api-section.md
@@ -0,0 +1,26 @@
+(api-section)=
+# API
+
+In this section, find complete API documentation for ExecuTorch's export, runtime, and extension interfaces. Includes comprehensive references for Python, C++, and Java APIs across all supported platforms.
+
+- {doc}`export-to-executorch-api-reference` — Export to ExecuTorch API Reference
+- {doc}`executorch-runtime-api-reference` — ExecuTorch Runtime API Reference
+- {doc}`runtime-python-api-reference` — Runtime Python API Reference
+- {doc}`api-life-cycle` — API Life Cycle
+- [Android doc →](https://pytorch.org/executorch/main/javadoc/) — Android API Documentation
+- {doc}`extension-module` — Extension Module
+- {doc}`extension-tensor` — Extension Tensor
+- {doc}`running-a-model-cpp-tutorial` — Detailed C++ Runtime APIs Tutorial
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: API Reference
+
+export-to-executorch-api-reference
+executorch-runtime-api-reference
+runtime-python-api-reference
+api-life-cycle
+extension-module
+extension-tensor
+running-a-model-cpp-tutorial
diff --git a/docs/source/api.md b/docs/source/api.md
deleted file mode 100644
index 4f6160d258a..00000000000
--- a/docs/source/api.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# API
-
-```{toctree}
-:maxdepth: 1
-
-export-to-executorch-api-reference
-executorch-runtime-api-reference
-runtime-python-api-reference
-api-life-cycle
-Javadoc
-```
diff --git a/docs/source/backend-delegate-advanced.md b/docs/source/backend-delegate-advanced.md
new file mode 100644
index 00000000000..752bd1cdc02
--- /dev/null
+++ b/docs/source/backend-delegate-advanced.md
@@ -0,0 +1,33 @@
+(backend-delegate-advanced)=
+
+# Backend & Delegates
+
+## Integration
+
+- {doc}`backend-delegates-integration` — Learn how to integrate a backend delegate into ExecuTorch
+
+## XNNPACK Reference
+
+- {doc}`backend-delegates-xnnpack-reference` — Deep dive into XNNPACK delegate internals and implementation details
+
+## Dependency Management
+
+- {doc}`backend-delegates-dependencies` — Manage third-party dependencies for backend delegates
+
+## Overview
+
+- {doc}`compiler-delegate-and-partitioner` — Understanding backends, delegates, and the partitioner system
+
+## Debugging
+
+- {doc}`debug-backend-delegate` — Tools and techniques for debugging delegation issues
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+backend-delegates-integration
+backend-delegates-xnnpack-reference
+backend-delegates-dependencies
+compiler-delegate-and-partitioner
+debug-backend-delegate
diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md
index c83ace26853..b15b466d6a6 100644
--- a/docs/source/backends-overview.md
+++ b/docs/source/backends-overview.md
@@ -1,21 +1,64 @@
-# Backend Overview
+# Backends
-ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each.
+## Backend Overview
-The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details.
+ExecuTorch backends provide hardware acceleration for specific hardware targets, enabling models to run efficiently on devices ranging from mobile phones to embedded systems and DSPs. During the export and lowering process, ExecuTorch optimizes your model for the chosen backend, resulting in a `.pte` file specialized for that hardware. To support multiple platforms (e.g., Core ML on iOS, Arm CPU on Android), you typically generate a dedicated `.pte` file for each backend.
-As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example.
+The choice of backend is informed by the hardware your model will run on. Each backend has its own hardware requirements and level of model/operator support. See the documentation for each backend for details.
-### Available Backends
+As part of `.pte` file creation, ExecuTorch identifies model partitions supported by the backend. These are processed ahead of time for efficient execution. Operators not supported by the delegate are executed using the portable CPU fallback (e.g., XNNPACK), allowing for partial acceleration. You can also specify multiple partitioners in order of priority, so unsupported GPU ops can fall back to CPU, for example.
-Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information.
+---
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
-- [Core ML (iOS)](backends-coreml.md)
-- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
-- [Vulkan (Android GPU)](backends-vulkan.md)
-- [Qualcomm NPU](backends-qualcomm.md)
-- [MediaTek NPU](backends-mediatek.md)
-- [ARM Ethos-U NPU](backends-arm-ethos-u.md)
-- [ARM VGF](backends-arm-vgf.md)
-- [Cadence DSP](backends-cadence.md)
+## Why Backends Matter
+
+Backends are the bridge between your exported model and the hardware it runs on. Choosing the right backend ensures your model takes full advantage of device-specific acceleration, balancing performance, compatibility, and resource usage.
+
+---
+
+## Choosing a Backend
+
+| Backend | Platform(s) | Hardware Type | Typical Use Case |
+|------------------------------------------|---------------------|---------------|---------------------------------|
+| [XNNPACK](backends-xnnpack) | All | CPU | General-purpose, fallback |
+| [Core ML](backends-coreml) | iOS, macOS | NPU/GPU | Apple devices, high performance |
+| [Metal Performance Shaders](backends-mps)| iOS, macOS | GPU | Apple GPU acceleration |
+| [Vulkan ](backends-vulkan) | Android | GPU | Android GPU acceleration |
+| [Qualcomm](backends-qualcomm) | Android | NPU | Qualcomm SoCs |
+| [MediaTek](backends-mediatek) | Android | NPU | MediaTek SoCs |
+| [ARM EthosU](backends-arm-ethos-u) | Embedded | NPU | ARM MCUs |
+| [ARM VGF](backends-arm-vgf) | Android | NPU | ARM platforms |
+| [OpenVINO](build-run-openvino) | Embedded | CPU/GPU/NPU | Intel SoCs |
+| [NXP](backends-nxp) | Embedded | NPU | NXP SoCs |
+| [Cadence](backends-cadence) | Embedded | DSP | DSP-optimized workloads |
+| [Samsung Exynos](backends-samsung-exynos)| Android             | NPU           | Samsung SoCs                    |
+
+**Tip:** For best performance, export a `.pte` file for each backend you plan to support.
+
+---
+
+## Best Practices
+
+- **Test on all target devices:** Operator support may vary by backend.
+- **Use fallback wisely:** If a backend doesn't support an operator, ExecuTorch will run it on CPU.
+- **Consult backend docs:** Each backend has unique setup and tuning options.
+
+---
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Backend Overview
+
+backends-xnnpack
+backends-coreml
+backends-mps
+backends-vulkan
+backends-qualcomm
+backends-mediatek
+backends-arm-ethos-u
+backends-arm-vgf
+build-run-openvino
+backends-nxp
+backends-cadence
+backends-samsung-exynos
diff --git a/docs/source/backends-samsung-exynos.md b/docs/source/backends-samsung-exynos.md
new file mode 100644
index 00000000000..0d77936bf7f
--- /dev/null
+++ b/docs/source/backends-samsung-exynos.md
@@ -0,0 +1 @@
+# Samsung Exynos Backend (TBD)
diff --git a/docs/source/backends-section.md b/docs/source/backends-section.md
new file mode 100644
index 00000000000..29a235a9416
--- /dev/null
+++ b/docs/source/backends-section.md
@@ -0,0 +1 @@
+```{include} backends-overview.md
diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md
index d1a120e69fa..75ec17809a4 100644
--- a/docs/source/backends-xnnpack.md
+++ b/docs/source/backends-xnnpack.md
@@ -67,10 +67,11 @@ The XNNPACK delegate can also be used as a backend to execute symmetrically quan
### Supported Quantization Schemes
The XNNPACK delegate supports the following quantization schemes:
+
- 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow).
- - Supports both static and dynamic activations.
- - Supports per-channel and per-tensor schemes.
- - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
+ - Supports both static and dynamic activations.
+ - Supports per-channel and per-tensor schemes.
+ - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
Weight-only quantization is not currently supported on XNNPACK.
diff --git a/docs/source/backends.md b/docs/source/backends.md
deleted file mode 100644
index 53db638f36d..00000000000
--- a/docs/source/backends.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Backends
-
-```{toctree}
-:maxdepth: 1
-
-backends-overview
-backends-xnnpack
-backends-coreml
-backends-mps
-backends-vulkan
-backends-arm-ethos-u
-backends-qualcomm
-backends-mediatek
-backends-cadence
-OpenVINO Backend
-backends-nxp
-```
diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md
index c633bb1fd12..437361517cc 100644
--- a/docs/source/compiler-delegate-and-partitioner.md
+++ b/docs/source/compiler-delegate-and-partitioner.md
@@ -1,4 +1,4 @@
-# Backends and Delegates
+# Understanding Backends and Delegates
Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch
diff --git a/docs/source/compiler-ir-advanced.md b/docs/source/compiler-ir-advanced.md
new file mode 100644
index 00000000000..b6d24026d5a
--- /dev/null
+++ b/docs/source/compiler-ir-advanced.md
@@ -0,0 +1,31 @@
+(compiler-ir-advanced)=
+# Compiler & IR
+
+Advanced compiler features and intermediate representation specifications.
+
+## Compiler Passes
+
+- {doc}`compiler-custom-compiler-passes` — Custom compiler passes and optimization
+
+## Memory Management
+
+- {doc}`compiler-memory-planning` — Advanced memory planning strategies
+
+## Intermediate Representation
+
+- {doc}`ir-exir` — EXIR (Export Intermediate Representation) specification
+- {doc}`ir-ops-set-definition` — Ops set definition and operator standardization
+
+## Backend dialect
+
+- {doc}`compiler-backend-dialect` — Backend dialect and compiler integration
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+compiler-custom-compiler-passes
+compiler-memory-planning
+ir-exir
+ir-ops-set-definition
+compiler-backend-dialect
diff --git a/docs/source/desktop-backends.md b/docs/source/desktop-backends.md
new file mode 100644
index 00000000000..e4220edb47f
--- /dev/null
+++ b/docs/source/desktop-backends.md
@@ -0,0 +1,27 @@
+(desktop-backends)=
+# Backends
+
+Available hardware acceleration backends for desktop platforms.
+
+## Linux Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+## macOS Backends
+
+- {doc}`desktop-coreml` — CoreML (recommended for Apple Silicon)
+- {doc}`desktop-mps` — Metal Performance Shaders (Apple Silicon GPU)
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+
+## Windows Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+```{toctree}
+:hidden:
+desktop-xnnpack
+desktop-openvino
+desktop-coreml
+desktop-mps
diff --git a/docs/source/desktop-coreml.md b/docs/source/desktop-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/desktop-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/desktop-mps.md b/docs/source/desktop-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/desktop-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/desktop-openvino.md b/docs/source/desktop-openvino.md
new file mode 100644
index 00000000000..a0fd5774c73
--- /dev/null
+++ b/docs/source/desktop-openvino.md
@@ -0,0 +1 @@
+```{include} build-run-openvino.md
diff --git a/docs/source/desktop-section.md b/docs/source/desktop-section.md
new file mode 100644
index 00000000000..7afccbe1d4f
--- /dev/null
+++ b/docs/source/desktop-section.md
@@ -0,0 +1,19 @@
+(desktop-section)=
+# Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends for each platform.
+
+## Platform Overview & Runtime
+
+- {doc}`using-executorch-cpp` — C++ runtime integration guide
+- {doc}`using-executorch-building-from-source` — Building ExecuTorch from source
+
+## Backends
+
+- {doc}`desktop-backends` — Available desktop backends and platform-specific optimization
+
+```{toctree}
+:hidden:
+using-executorch-cpp
+using-executorch-building-from-source
+desktop-backends
diff --git a/docs/source/desktop-xnnpack.md b/docs/source/desktop-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/desktop-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/edge-platforms-section.md b/docs/source/edge-platforms-section.md
new file mode 100644
index 00000000000..8761325451d
--- /dev/null
+++ b/docs/source/edge-platforms-section.md
@@ -0,0 +1,73 @@
+(edge-platforms-section)=
+# Edge
+
+Deploy ExecuTorch on mobile, desktop, and embedded platforms with optimized backends for each.
+
+ExecuTorch supports deployment across a wide variety of edge computing platforms, from high-end mobile devices to constrained embedded systems and microcontrollers.
+
+## Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+**→ {doc}`android-section` — Complete Android deployment guide**
+
+Key features:
+- Hardware acceleration support (CPU, GPU, NPU)
+- Multiple backend options (XNNPACK, Vulkan, Qualcomm, MediaTek, ARM, Samsung)
+- Comprehensive examples and demos
+
+## iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+**→ {doc}`ios-section` — Complete iOS deployment guide**
+
+Key features:
+- Apple hardware optimization (CoreML, MPS, XNNPACK)
+- Swift and Objective-C integration
+- LLM and computer vision examples
+
+## Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends.
+
+**→ {doc}`desktop-section` — Complete desktop deployment guide**
+
+Key features:
+- Cross-platform C++ runtime
+- Platform-specific optimization (OpenVINO, CoreML, MPS)
+- CPU and GPU acceleration options
+
+## Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+**→ {doc}`embedded-section` — Complete embedded deployment guide**
+
+Key features:
+
+- Resource-constrained deployment
+- DSP and NPU acceleration (Cadence, ARM Ethos-U, NXP)
+- Custom backend development support
+- LLM and computer vision examples
+
+## Troubleshooting & Support
+
+- **{doc}`using-executorch-troubleshooting`** - Common issues and solutions across all platforms
+
+## Next Steps
+
+After choosing your platform:
+- **{doc}`backends-section`** - Deep dive into backend selection and optimization
+- **{doc}`llms-section`** - Working with Large Language Models on edge devices
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Edge Platforms
+
+android-section
+ios-section
+desktop-section
+embedded-section
+using-executorch-troubleshooting
diff --git a/docs/source/embedded-arm-ethos-u.md b/docs/source/embedded-arm-ethos-u.md
new file mode 100644
index 00000000000..cdc544a6553
--- /dev/null
+++ b/docs/source/embedded-arm-ethos-u.md
@@ -0,0 +1 @@
+```{include} backends-arm-ethos-u.md
diff --git a/docs/source/embedded-backends.md b/docs/source/embedded-backends.md
new file mode 100644
index 00000000000..4ed7962ef42
--- /dev/null
+++ b/docs/source/embedded-backends.md
@@ -0,0 +1,20 @@
+(embedded-backends)=
+# Backends
+
+Available hardware acceleration backends for embedded systems.
+
+## DSP Acceleration
+
+- {doc}`embedded-cadence` — Cadence Xtensa DSP processors
+
+## NPU Acceleration
+
+- {doc}`embedded-arm-ethos-u` — ARM Ethos-U NPU acceleration
+- {doc}`embedded-nxp` — NXP eIQ Neutron Backend
+
+
+```{toctree}
+:hidden:
+embedded-cadence
+embedded-arm-ethos-u
+embedded-nxp
diff --git a/docs/source/embedded-cadence.md b/docs/source/embedded-cadence.md
new file mode 100644
index 00000000000..d2f7ea78259
--- /dev/null
+++ b/docs/source/embedded-cadence.md
@@ -0,0 +1 @@
+```{include} backends-cadence.md
diff --git a/docs/source/embedded-nxp.md b/docs/source/embedded-nxp.md
new file mode 100644
index 00000000000..35d8f0ab75d
--- /dev/null
+++ b/docs/source/embedded-nxp.md
@@ -0,0 +1 @@
+```{include} backends-nxp.md
diff --git a/docs/source/embedded-section.md b/docs/source/embedded-section.md
new file mode 100644
index 00000000000..834001afbc3
--- /dev/null
+++ b/docs/source/embedded-section.md
@@ -0,0 +1,39 @@
+(embedded-section)=
+
+# Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+## API Reference & Development
+
+Start here for C++ development with ExecuTorch runtime APIs and essential tutorials.
+
+- {doc}`executorch-runtime-api-reference` — **Start here**: Complete runtime API reference for embedded development
+- {doc}`running-a-model-cpp-tutorial` — Step-by-step C++ API tutorial with practical examples
+- {doc}`extension-module` — Custom module extensions for specialized functionality
+- {doc}`extension-tensor` — Tensor operations and memory management extensions
+
+## Build & Integration Guide
+
+- {doc}`using-executorch-cpp` — Complete setup guide for C++ runtime integration
+- {doc}`using-executorch-building-from-source` — Building from Source
+
+## Choose Backend for acceleration
+
+- {doc}`embedded-backends` — Available embedded backends and acceleration options
+
+## Tutorials
+
+- {doc}`tutorial-arm-ethos-u` — Export a simple PyTorch model for the ExecuTorch Ethos-U backend
+
+
+```{toctree}
+:hidden:
+executorch-runtime-api-reference
+running-a-model-cpp-tutorial
+extension-module
+extension-tensor
+using-executorch-cpp
+using-executorch-building-from-source
+embedded-backends
+tutorial-arm-ethos-u
diff --git a/docs/source/file-formats-advanced.md b/docs/source/file-formats-advanced.md
new file mode 100644
index 00000000000..c16ebccfd65
--- /dev/null
+++ b/docs/source/file-formats-advanced.md
@@ -0,0 +1,17 @@
+(file-formats-advanced)=
+
+# File Formats
+
+ExecuTorch file format specifications and internal structure.
+
+## Program File Formats
+
+- {doc}`pte-file-format` — PTE (PyTorch ExecuTorch) file format specification
+- {doc}`ptd-file-format` — PTD file format specification
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+pte-file-format
+ptd-file-format
diff --git a/docs/source/index.md b/docs/source/index.md
index fd0957d8fd4..b65139319a7 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,134 +1,195 @@
(home)=
# Welcome to the ExecuTorch Documentation
-**ExecuTorch** is PyTorch's solution to training and inference on the
-Edge.
+**ExecuTorch** is PyTorch's solution for efficient AI inference on edge devices — from mobile phones to embedded systems.
## Key Value Propositions
-- **Portability:** Compatibility with a wide variety of computing
- platforms, from high-end mobile phones to highly constrained
- embedded systems and microcontrollers.
-- **Productivity:** Enabling developers to use the same toolchains and
- Developer Tools from PyTorch model authoring and conversion, to
- debugging and deployment to a wide variety of platforms.
-- **Performance:** Providing end users with a seamless and
- high-performance experience due to a lightweight runtime and
- utilizing full hardware capabilities such as CPUs, NPUs, and DSPs.
-
-ExecuTorch provides support for:
-
-* **Strong Model Support** LLMs (Large Language Models),
- CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech)
-* **All Major Platforms** Android, Mac, Linux, Windows
-* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek, NXP, OpenVino, Qualcomm, Vulkan, XNNPACK
-
-### Documentation Navigation
-#### Introduction
-- [Overview](intro-overview)
-- [How it Works](intro-how-it-works)
-- [Getting Started with Architecture](getting-started-architecture)
-- [Concepts](concepts)
-#### Usage
-- [Getting Started](getting-started)
-- [Using Executorch Export](using-executorch-export)
-- [Using Executorch on Android](using-executorch-android)
-- [Using Executorch on iOS](using-executorch-ios)
-- [Using Executorch with C++](using-executorch-cpp)
-- [Runtime Integration](using-executorch-runtime-integration)
-- [Troubleshooting](using-executorch-troubleshooting)
-- [Building from Source](using-executorch-building-from-source)
-- [Quantization](quantization-overview)
-- [FAQs](using-executorch-faqs)
-#### Examples
-- [Android Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
-- [iOS Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
-- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md)
-#### Backends
-- [Overview](backends-overview)
-- [XNNPACK](backends-xnnpack)
-- [Core ML](backends-coreml)
-- [MPS](backends-mps)
-- [Vulkan](backends-vulkan)
-- [ARM Ethos-U](backends-arm-ethos-u)
-- [ARM VGF](backends-arm-vgf)
-- [Qualcomm](backends-qualcomm)
-- [MediaTek](backends-mediatek)
-- [Cadence](backends-cadence)
-- [OpenVINO](build-run-openvino)
-- [NXP](backend-nxp)
-#### Developer Tools
-- [Overview](devtools-overview)
-- [Bundled IO](bundled-io)
-- [ETRecord](etrecord)
-- [ETDump](etdump)
-- [Runtime Profiling](runtime-profiling)
-- [Model Debugging](model-debugging)
-- [Model Inspector](model-inspector)
-- [Memory Planning Inspection](memory-planning-inspection)
-- [Delegate Debugging](delegate-debugging)
-- [Tutorial](devtools-tutorial)
-#### Runtime
-- [Overview](runtime-overview)
-- [Extension Module](extension-module)
-- [Extension Tensor](extension-tensor)
-- [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial)
-- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking)
-- [Platform Abstraction Layer](runtime-platform-abstraction-layer)
-#### Portable C++ Programming
-- [PTE File Format](pte-file-format)
-- [PTD File Format](ptd-file-format)
-#### API Reference
-- [Export to Executorch API Reference](export-to-executorch-api-reference)
-- [Executorch Runtime API Reference](executorch-runtime-api-reference)
-- [Runtime Python API Reference](runtime-python-api-reference)
-- [API Life Cycle](api-life-cycle)
-- [Javadoc](https://pytorch.org/executorch/main/javadoc/)
-#### Kernel Library
-- [Overview](kernel-library-overview)
-- [Custom ATen Kernel](kernel-library-custom-aten-kernel)
-- [Selective Build](kernel-library-selective-build)
-#### Working with LLMs
-- [Getting Started](llm/getting-started.md)
-- [Exporting LLMs](llm/export-llm.md)
-- [Exporting custom LLMs](llm/export-custom-llm.md)
-- [Running with C++](llm/run-with-c-plus-plus.md)
-- [Running on Android (XNNPack)](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
-- [Running on Android (QNN)](llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md)
-- [Running on iOS](llm/run-on-ios.md)
-#### Backend Development
-- [Delegates Integration](backend-delegates-integration)
-- [XNNPACK Reference](backend-delegates-xnnpack-reference)
-- [Dependencies](backend-delegates-dependencies)
-- [Compiler Delegate and Partitioner](compiler-delegate-and-partitioner)
-- [Debug Backend Delegate](debug-backend-delegate)
-#### IR Specification
-- [EXIR](ir-exir)
-- [Ops Set Definition](ir-ops-set-definition)
-#### Compiler Entry Points
-- [Backend Dialect](compiler-backend-dialect)
-- [Custom Compiler Passes](compiler-custom-compiler-passes)
-- [Memory Planning](compiler-memory-planning)
-#### Contributing
-- [Contributing](contributing)
+- **Portability:** Run on diverse platforms, from high-end mobile to constrained microcontrollers
+- **Performance:** Lightweight runtime with full hardware acceleration (CPU, GPU, NPU, DSP)
+- **Productivity:** Use familiar PyTorch tools from authoring to deployment
+
+---
+
+## 🎯 Wins & Success Stories
+
+::::{grid} 1
+:class-container: success-showcase
+:::{grid-item-card}
+:class-header: bg-primary text-white
+:class-body: text-center
+[View All Success Stories →](success-stories)
+:::
+::::
+
+---
+
+## Quick Navigation
+
+::::{grid} 2
+
+:::{grid-item-card} **Get Started**
+:link: quick-start-section
+:link-type: doc
+
+New to ExecuTorch? Start here for installation and your first model deployment.
+:::
+
+:::{grid-item-card} **Deploy on Edge Platforms**
+:link: edge-platforms-section
+:link-type: doc
+
+Deploy on Android, iOS, desktop, and embedded platforms with optimized backends.
+:::
+
+:::{grid-item-card} **Work with LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+Export, optimize, and deploy Large Language Models on edge devices.
+:::
+
+:::{grid-item-card} 🔧 **Developer Tools**
+:link: tools-section
+:link-type: doc
+
+Profile, debug, and inspect your models with comprehensive tooling.
+:::
+
+::::
+
+---
+
+## Explore Documentation
+
+::::{grid} 1
+:::{grid-item-card} **Intro**
+:link: intro-section
+:link-type: doc
+
+**Overview, architecture, and core concepts** — Understand how ExecuTorch works and its benefits
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Quick Start**
+:link: quick-start-section
+:link-type: doc
+
+**Get started with ExecuTorch** — Install, export your first model, and run inference
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Edge**
+:link: edge-platforms-section
+:link-type: doc
+
+**Android, iOS, Desktop, Embedded** — Platform-specific deployment guides and examples
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Backends**
+:link: backends-section
+:link-type: doc
+
+**CPU, GPU, NPU/Accelerator backends** — Hardware acceleration and backend selection
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+**LLM export, optimization, and deployment** — Complete LLM workflow for edge devices
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Advanced**
+:link: advanced-topics-section
+:link-type: doc
+
+**Quantization, memory planning, custom passes** — Deep customization and optimization
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Tools**
+:link: tools-section
+:link-type: doc
+
+**Developer tools, profiling, debugging** — Comprehensive development and debugging suite
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **API**
+:link: api-section
+:link-type: doc
+
+**API Reference Usages & Examples** — Detailed Python, C++, and Java API references
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **💬 Support**
+:link: support-section
+:link-type: doc
+
+**FAQ, troubleshooting, contributing** — Get help and contribute to the project
+:::
+::::
+
+---
+
+## What's Supported
+
+::::{grid} 3
+
+:::{grid-item}
+**Model Types**
+
+- Large Language Models (LLMs)
+- Computer Vision (CV)
+- Speech Recognition (ASR)
+- Text-to-Speech (TTS)
+- More ...
+:::
+
+:::{grid-item}
+**Platforms**
+
+- Android & iOS
+- Linux, macOS, Windows
+- Embedded & MCUs
+- Go **→ {doc}`edge-platforms-section`**
+:::
+
+:::{grid-item}
+**Rich Acceleration**
+
+- CPU
+- GPU
+- NPU
+- DSP
+- Go **→ {doc}`backends-section`**
+:::
+
+::::
```{toctree}
-:glob:
-:maxdepth: 1
:hidden:
+:maxdepth: 1
-intro
-usage
-examples
-backends
-developer-tools
-runtime
-api
-quantization
-kernel-library
+intro-section
+quick-start-section
+edge-platforms-section
+backends-section
llm/working-with-llms
-backend-development
-ir-specification
-compiler-entry-points
-contributing
-```
+advanced-topics-section
+tools-section
+api-section
+support-section
diff --git a/docs/source/intro-section.md b/docs/source/intro-section.md
new file mode 100644
index 00000000000..2f6f3c57c88
--- /dev/null
+++ b/docs/source/intro-section.md
@@ -0,0 +1,27 @@
+(intro-section)=
+
+# Intro
+
+Overview, architecture, and core concepts of ExecuTorch.
+
+ExecuTorch is PyTorch's solution for efficient AI inference on edge devices, providing portability, productivity, and performance across a wide range of edge computing platforms.
+
+## Getting Started with ExecuTorch
+
+New to ExecuTorch? Start with these foundational topics:
+
+- **{doc}`intro-overview`** - High-level overview of ExecuTorch capabilities
+- **{doc}`intro-how-it-works`** - Technical overview of the ExecuTorch workflow
+- **{doc}`getting-started-architecture`** - System architecture and components
+- **{doc}`concepts`** - Core concepts and terminology
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Introduction Topics
+
+intro-overview
+intro-how-it-works
+getting-started-architecture
+concepts
+```
diff --git a/docs/source/intro.md b/docs/source/intro.md
deleted file mode 100644
index f6609cc3ba7..00000000000
--- a/docs/source/intro.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Intro
-
-```{toctree}
-:maxdepth: 1
-
-intro-overview
-intro-how-it-works
-getting-started-architecture
-concepts
-```
diff --git a/docs/source/ios-backends.md b/docs/source/ios-backends.md
new file mode 100644
index 00000000000..cb186f53319
--- /dev/null
+++ b/docs/source/ios-backends.md
@@ -0,0 +1,19 @@
+(ios-backends)=
+# Backends
+
+Available hardware acceleration backends for iOS deployment.
+
+## Apple Hardware Acceleration (Recommended)
+
+- {doc}`ios-coreml` — CoreML (NPU/GPU, recommended for iOS)
+- {doc}`ios-mps` — Metal Performance Shaders (GPU)
+
+## CPU Acceleration
+
+- {doc}`ios-xnnpack` — XNNPACK (CPU acceleration)
+
+```{toctree}
+:hidden:
+ios-coreml
+ios-mps
+ios-xnnpack
diff --git a/docs/source/ios-coreml.md b/docs/source/ios-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/ios-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/ios-examples.md b/docs/source/ios-examples.md
new file mode 100644
index 00000000000..86acf3273a6
--- /dev/null
+++ b/docs/source/ios-examples.md
@@ -0,0 +1,4 @@
+# Examples & Demos
+
+- [iOS LLM Examples Repository](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple)
+- [MobileViT Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
diff --git a/docs/source/ios-mps.md b/docs/source/ios-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/ios-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/ios-section.md b/docs/source/ios-section.md
new file mode 100644
index 00000000000..33c9a61ce1d
--- /dev/null
+++ b/docs/source/ios-section.md
@@ -0,0 +1,23 @@
+(ios-section)=
+# iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-ios` — Complete iOS integration guide
+
+## Backends
+
+- {doc}`ios-backends` — Available iOS backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`ios-examples` — Explore iOS Examples & Demos
+
+
+```{toctree}
+:hidden:
+using-executorch-ios
+ios-backends
+ios-examples
diff --git a/docs/source/ios-xnnpack.md b/docs/source/ios-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/ios-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/kernel-library-advanced.md b/docs/source/kernel-library-advanced.md
new file mode 100644
index 00000000000..5f0215b87c1
--- /dev/null
+++ b/docs/source/kernel-library-advanced.md
@@ -0,0 +1,23 @@
+(kernel-library-advanced)=
+
+# Kernel Library Deep Dive
+
+Advanced kernel implementation and customization for ExecuTorch.
+
+## Kernel Library Overview
+
+- {doc}`kernel-library-overview` — Architecture and design of the kernel library
+
+- {doc}`kernel-library-custom-aten-kernel` — Kernel registration and customization
+
+## Build Optimization
+
+- {doc}`kernel-library-selective-build` — Selective build for reduced binary footprint
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+kernel-library-overview
+kernel-library-custom-aten-kernel
+kernel-library-selective-build
diff --git a/docs/source/kernel-library-overview.md b/docs/source/kernel-library-overview.md
index cfd46524097..a826b334ba4 100644
--- a/docs/source/kernel-library-overview.md
+++ b/docs/source/kernel-library-overview.md
@@ -1,7 +1,7 @@
-This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
-
# Overview of ExecuTorch’s Kernel Libraries
+This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
+
An ExecuTorch program encodes instructions that describe the computation that should be performed by the program. Many of these instructions will correspond to calling a specific ATen operator, for example `aten.convolution`. However, one of the core design principles of ExecuTorch is that the signature of an operator should be separate from the implementation of the operator. This means that the ExecuTorch runtime does not ship with any standard implementation for ATen operators; users must make sure to link against kernel libraries that contain implementations of the operators required by their ExecuTorch program, and configure [operator registration](kernel-library-custom-aten-kernel.md) to map an operator signature to the desired implementation. This makes it easy to adjust the implementation of operators such as `aten.convolution` that will be called when executing an ExecuTorch program; it allows users to select the exact operator implementations that will meet the unique performance, memory usage, battery usage, etc. constraints of their use-case.
**In essence, a kernel library is simply a collection of ATen operator implementations that follow a common theme or design principle**. Note that due to ExecuTorch’s selective build process (discussed in the following section), operator implementations are linked individually. This means that users can easily mix different kernel libraries in their build without sacrificing build size.
diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
index 4587589a51b..642dc04da58 100644
--- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
+++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
@@ -1,4 +1,4 @@
-# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend
+# Run Llama 3 8B on Android (with Qualcomm AI Engine Direct Backend)
This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device.
@@ -56,7 +56,7 @@ backend:
qnn:
enabled: True
num_sharding: 8
-
+
# export_llm
python -m extension.llm.export.export_llm \
@@ -136,6 +136,7 @@ You should see the message:
```
## What is coming?
+
- Performance improvements
- Reduce the memory pressure during inference to support 12GB Qualcomm devices
- Support more LLMs (Qwen, Phi-4-mini, etc.)
diff --git a/docs/source/llm/working-with-llms.md b/docs/source/llm/working-with-llms.md
index 17b2e46c0a5..4c238f7ae5c 100644
--- a/docs/source/llm/working-with-llms.md
+++ b/docs/source/llm/working-with-llms.md
@@ -1,13 +1,18 @@
-# Working with LLMs
+(working-with-llms)=
+
+# LLMs
+
+Learn how to export LLM models and deploy them across different platforms and runtime environments. This section covers the complete workflow from model export to running inference on mobile devices and edge hardware.
+
```{toctree}
:maxdepth: 1
+:caption: Working with LLMs
getting-started
export-llm
export-custom-llm
run-with-c-plus-plus
-llama-demo-android
build-run-llama3-qualcomm-ai-engine-direct-backend
run-on-ios
```
diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md
new file mode 100644
index 00000000000..acbdb06a6b6
--- /dev/null
+++ b/docs/source/platforms-desktop.md
@@ -0,0 +1,23 @@
+# Desktop & Laptop
+
+ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windows.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Platform
+
+### Linux
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
+- [ARM Ethos-U (ARM64)](backends-arm-ethos-u)
+
+### macOS
+- [CoreML (recommended)](backends-coreml)
+- [MPS (Apple Silicon)](backends-mps)
+- [XNNPACK (CPU)](backends-xnnpack)
+
+### Windows
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md
new file mode 100644
index 00000000000..5ea248fc0d9
--- /dev/null
+++ b/docs/source/platforms-embedded.md
@@ -0,0 +1,19 @@
+# Embedded Platforms
+
+ExecuTorch supports embedded devices from microcontrollers to edge devices.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Device Type
+
+### Microcontrollers
+- [Cadence Xtensa Backend](backends-cadence)
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [Custom Backend Development](backend-delegates-integration)
+
+### Edge Devices
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [NXP eIQ Neutron Backend](backend-nxp)
+- [Custom Hardware Integration](backend-delegates-integration)
diff --git a/docs/source/quantization-optimization.md b/docs/source/quantization-optimization.md
new file mode 100644
index 00000000000..d2005b3adac
--- /dev/null
+++ b/docs/source/quantization-optimization.md
@@ -0,0 +1,20 @@
+(quantization-optimization)=
+
+# Quantization & Optimization
+
+Advanced techniques for model compression and performance optimization.
+
+## Quantization Strategies
+
+- {doc}`quantization-overview` — Comprehensive quantization strategies and techniques
+
+## Performance Optimization
+
+- {doc}`runtime-profiling` — Performance profiling and optimization techniques
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+quantization-overview
+runtime-profiling
diff --git a/docs/source/quick-start-section.md b/docs/source/quick-start-section.md
new file mode 100644
index 00000000000..b35bed8d22c
--- /dev/null
+++ b/docs/source/quick-start-section.md
@@ -0,0 +1,38 @@
+(quick-start-section)=
+# Quick Start
+
+Get started with ExecuTorch in just a few steps.
+
+This section walks you through the essential steps to get ExecuTorch up and running, from initial setup to exporting your first model for edge deployment.
+
+## What You'll Learn
+
+Follow these guides in order to get started with ExecuTorch:
+
+- **{doc}`getting-started`** - Initial Setup: Set up your development environment and run your first ExecuTorch example.
+
+- **{doc}`using-executorch-export`** - Exporting your model: Export for Edge deployment.
+
+- **{doc}`using-executorch-building-from-source`** - Building from Source: Build ExecuTorch from source for custom configurations and development.
+
+## Prerequisites
+
+- Python 3.10-3.12
+- PyTorch 2.9+
+- Basic familiarity with PyTorch model development
+
+## Next Steps
+
+After completing the quick start, explore:
+
+- **{doc}`edge-platforms-section`** - Deploy to specific platforms (Android, iOS, Desktop, Embedded)
+- **{doc}`backends-section`** - Choose the right acceleration backend for your hardware
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Quick Start Guide
+
+getting-started
+using-executorch-export
+using-executorch-building-from-source
diff --git a/docs/source/runtime-integration-advanced.md b/docs/source/runtime-integration-advanced.md
new file mode 100644
index 00000000000..a76265c4093
--- /dev/null
+++ b/docs/source/runtime-integration-advanced.md
@@ -0,0 +1,20 @@
+(runtime-integration-advanced)=
+
+# Runtime & Integration
+
+Advanced runtime integration topics.
+
+## Platform Integration
+
+- {doc}`runtime-platform-abstraction-layer` — Platform abstraction layer for cross-platform deployment
+
+## Portable C++ Programming
+
+- {doc}`portable-cpp-programming` — Portable C++ programming for cross-platform deployment
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+runtime-platform-abstraction-layer
+portable-cpp-programming
diff --git a/docs/source/success-stories.md b/docs/source/success-stories.md
new file mode 100644
index 00000000000..cba874132c6
--- /dev/null
+++ b/docs/source/success-stories.md
@@ -0,0 +1,56 @@
+(success-stories)=
+
+# Success Stories
+
+Discover how organizations are leveraging ExecuTorch to deploy AI models at scale on edge devices.
+
+---
+
+## 🎯 Featured Success Stories
+
+::::{grid} 1
+:gutter: 3
+
+:::{grid-item-card} **🚀 Story 1: [Title Placeholder]**
+:class-header: bg-primary text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-1-details)
+:::
+
+:::{grid-item-card} **⚡ Story 2: [Title Placeholder]**
+:class-header: bg-success text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+
+[Read Full Story →](#story-2-details)
+:::
+
+:::{grid-item-card} **🧠 Story 3: [Title Placeholder]**
+:class-header: bg-info text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-3-details)
+:::
+
+::::
+
+---
diff --git a/docs/source/support-section.md b/docs/source/support-section.md
new file mode 100644
index 00000000000..64c47a3e55b
--- /dev/null
+++ b/docs/source/support-section.md
@@ -0,0 +1,17 @@
+(support-section)=
+# Support
+
+In this section, find answers to common questions, troubleshooting guides, and information on how to contribute to the ExecuTorch project. Get help with issues and learn how to participate in the community.
+
+- {doc}`using-executorch-faqs` — FAQ
+- {doc}`using-executorch-troubleshooting` — Common Issues
+- {doc}`contributing` — Contributing
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Support
+
+using-executorch-faqs
+using-executorch-troubleshooting
+contributing
diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md
new file mode 100644
index 00000000000..461a1f6849a
--- /dev/null
+++ b/docs/source/tools-section.md
@@ -0,0 +1,30 @@
+(tools-sdk-section)=
+
+# Tools
+
+In this section, explore ExecuTorch's comprehensive developer tools for profiling, debugging, and model inspection. These tools help optimize performance and troubleshoot issues during development and deployment.
+
+- {doc}`devtools-overview` — Developer Tools Overview
+- {doc}`bundled-io` — Bundled I/O
+- {doc}`etrecord` — ETRecord
+- {doc}`etdump` — ETDump
+- {doc}`runtime-profiling` — Profiling Suite
+- {doc}`model-debugging` — Debugging Tools
+- {doc}`model-inspector` — Model Inspector
+- {doc}`memory-planning-inspection` — Memory Planning Inspection
+- {doc}`devtools-tutorial` — Development Utilities
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Tools
+
+devtools-overview
+bundled-io
+etrecord
+etdump
+runtime-profiling
+model-debugging
+model-inspector
+memory-planning-inspection
+devtools-tutorial
diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md
index b3d1836b78a..2363affa7cb 100644
--- a/docs/source/using-executorch-export.md
+++ b/docs/source/using-executorch-export.md
@@ -32,7 +32,7 @@ As part of the .pte file creation process, ExecuTorch identifies portions of the
Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information.
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
+- [XNNPACK (CPU)](backends-xnnpack.md)
- [Core ML (iOS)](backends-coreml.md)
- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
- [Vulkan (Android GPU)](backends-vulkan.md)
From d8a21260d35a4acf2073266820950a819aafb8ae Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 16:42:10 -0700
Subject: [PATCH 145/266] Add Gemma 3 test.
Differential Revision: D84001548
Pull Request resolved: https://github.com/pytorch/executorch/pull/14825
---
.../Exported/ExecuTorchLLMMultimodalRunner.h | 16 ++
.../Exported/ExecuTorchLLMMultimodalRunner.mm | 84 +++++++-
.../__tests__/MultimodalRunnerTest.swift | 179 ++++++++++++++----
.../__tests__/TextRunnerTest.swift | 4 +-
4 files changed, 233 insertions(+), 50 deletions(-)
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index 8523581da8a..250241b9c9d 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
channels:(NSInteger)channels
NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithFloatData:(NSData *)data
+ width:(NSInteger)width
+ height:(NSInteger)height
+ channels:(NSInteger)channels
+ NS_DESIGNATED_INITIALIZER;
+
@property(nonatomic, readonly) NSData *data;
@property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))
@property(nonatomic, readonly) NSInteger channels;
+@property(nonatomic, readonly) BOOL isFloat;
+
+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;
@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
frames:(NSInteger)frames
NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithFloatData:(NSData *)data
+ batchSize:(NSInteger)batchSize
+ bins:(NSInteger)bins
+ frames:(NSInteger)frames
+ NS_DESIGNATED_INITIALIZER;
+
@property(nonatomic, readonly) NSData *data;
@property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))
@property(nonatomic, readonly) NSInteger frames;
+@property(nonatomic, readonly) BOOL isFloat;
+
+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index a3dc3e6afd1..964805053e2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
_width = width;
_height = height;
_channels = channels;
+ _isFloat = NO;
+ }
+ return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+ width:(NSInteger)width
+ height:(NSInteger)height
+ channels:(NSInteger)channels {
+ self = [super init];
+ if (self) {
+ _data = [data copy];
+ _width = width;
+ _height = height;
+ _channels = channels;
+ _isFloat = YES;
}
return self;
}
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
_batchSize = batchSize;
_bins = bins;
_frames = frames;
+ _isFloat = NO;
+ }
+ return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+ batchSize:(NSInteger)batchSize
+ bins:(NSInteger)bins
+ frames:(NSInteger)frames {
+ self = [super init];
+ if (self) {
+ _data = [data copy];
+ _batchSize = batchSize;
+ _bins = bins;
+ _frames = frames;
+ _isFloat = YES;
}
return self;
}
@@ -170,6 +202,7 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
return NO;
}
   std::vector<llm::MultimodalInput> nativeInputs;
+ nativeInputs.reserve((size_t)inputs.count);
for (ExecuTorchLLMMultimodalInput *input in inputs) {
switch (input.type) {
case ExecuTorchLLMMultimodalInputTypeText:
@@ -177,13 +210,50 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
break;
case ExecuTorchLLMMultimodalInputTypeImage: {
ExecuTorchLLMImage *image = input.image;
-        std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
- nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
- std::move(data),
- (int32_t)image.width,
- (int32_t)image.height,
- (int32_t)image.channels
- )));
+ if (image.isFloat) {
+ const float *buffer = (const float *)image.data.bytes;
+ size_t elementCount = (size_t)image.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+ std::move(data),
+ (int32_t)image.width,
+ (int32_t)image.height,
+ (int32_t)image.channels
+ )));
+ } else {
+ const uint8_t *buffer = (const uint8_t *)image.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + image.data.length);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+ std::move(data),
+ (int32_t)image.width,
+ (int32_t)image.height,
+ (int32_t)image.channels
+ )));
+ }
+ break;
+ }
+ case ExecuTorchLLMMultimodalInputTypeAudio: {
+ ExecuTorchLLMAudio *audio = input.audio;
+ if (audio.isFloat) {
+ const float *buffer = (const float *)audio.data.bytes;
+ size_t elementCount = (size_t)audio.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+ std::move(data),
+ (int32_t)audio.batchSize,
+ (int32_t)audio.bins,
+ (int32_t)audio.frames
+ )));
+ } else {
+ const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
+ nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+ std::move(data),
+ (int32_t)audio.batchSize,
+ (int32_t)audio.bins,
+ (int32_t)audio.frames
+ )));
+ }
break;
}
default: {
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index 7ae9da4969b..7281740c3af 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -10,60 +10,157 @@ import ExecuTorchLLM
import XCTest
extension UIImage {
- func asImage() -> Image {
- let targetSide = CGFloat(336)
- let scale = max(targetSide / size.width, targetSide / size.height)
- let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
+ func centerCropped(to sideSize: CGFloat) -> UIImage {
+ precondition(sideSize > 0)
let format = UIGraphicsImageRendererFormat.default()
format.scale = 1
- let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
- draw(in: CGRect(origin: .zero, size: scaledSize))
- }
- guard let scaledCGImage = scaledImage.cgImage else {
- return Image(data: Data(), width: 336, height: 336, channels: 3)
- }
- let cropRect = CGRect(
- x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
- y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
- width: targetSide.rounded(.down),
- height: targetSide.rounded(.down)
- )
- let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
- let imageWidth = croppedCGImage.width
- let imageHeight = croppedCGImage.height
- let pixelCount = imageWidth * imageHeight
- var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
- let context = CGContext(
+ format.opaque = false
+ return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
+ .image { _ in
+ let scaleFactor = max(sideSize / size.width, sideSize / size.height)
+ let scaledWidth = size.width * scaleFactor
+ let scaledHeight = size.height * scaleFactor
+ let originX = (sideSize - scaledWidth) / 2
+ let originY = (sideSize - scaledHeight) / 2
+ draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
+ }
+ }
+
+ func rgbBytes() -> [UInt8]? {
+ guard let cgImage = cgImage else { return nil }
+ let pixelWidth = Int(cgImage.width)
+ let pixelHeight = Int(cgImage.height)
+ let pixelCount = pixelWidth * pixelHeight
+ let bytesPerPixel = 4
+ let bytesPerRow = pixelWidth * bytesPerPixel
+ var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+ guard let context = CGContext(
data: &rgbaBuffer,
- width: imageWidth,
- height: imageHeight,
+ width: pixelWidth,
+ height: pixelHeight,
bitsPerComponent: 8,
- bytesPerRow: imageWidth * 4,
+ bytesPerRow: bytesPerRow,
space: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
- )!
- context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
- var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
+ ) else { return nil }
+
+ context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
+
+ var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytes[pixelIndex * 3] = rgbaBuffer[pixelIndex * 4]
+      rgbBytes[pixelIndex * 3 + 1] = rgbaBuffer[pixelIndex * 4 + 1]
+      rgbBytes[pixelIndex * 3 + 2] = rgbaBuffer[pixelIndex * 4 + 2]
+    }
+    return rgbBytes
+  }
+
+  func rgbBytesNormalized(mean: [Float], std: [Float]) -> [Float]? {
+ precondition(mean.count == 3 && std.count == 3)
+ precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
+ guard let rgbBytes = rgbBytes() else { return nil }
+ let pixelCount = rgbBytes.count / 3
+ var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<(pixelCount * 3) {
+      let channelIndex = pixelIndex % 3
+      rgbBytesNormalized[pixelIndex] = (Float(rgbBytes[pixelIndex]) / 255 - mean[channelIndex]) / std[channelIndex]
+    }
+    return rgbBytesNormalized
+  }
+
+  func asImage(_ sideSize: CGFloat) -> Image {
+ return Image(
+ data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
+ width: Int(sideSize),
+ height: Int(sideSize),
+ channels: 3
+ )
+ }
+
+ func asNormalizedImage(
+ _ sideSize: CGFloat,
+ mean: [Float] = [0.485, 0.456, 0.406],
+ std: [Float] = [0.229, 0.224, 0.225]
+ ) -> Image {
+ return Image(
+ float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
+ width: Int(sideSize),
+ height: Int(sideSize),
+ channels: 3
+ )
}
}
class MultimodalRunnerTest: XCTestCase {
- let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
- let assistantPrompt = "ASSISTANT: "
+ let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
let userPrompt = "What's on the picture?"
- let sequenceLength = 768
+
+ func testGemma() {
+    let chatTemplate = "<start_of_turn>user\n%@<end_of_turn>\n<start_of_turn>model\n"
+ let sideSize: CGFloat = 896
+ let sequenceLength = 768
+ let bundle = Bundle(for: type(of: self))
+ guard let modelPath = bundle.path(forResource: "gemma3", ofType: "pte"),
+ let tokenizerPath = bundle.path(forResource: "gemma3_tokenizer", ofType: "model"),
+ let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
+ let uiImage = UIImage(contentsOfFile: imagePath) else {
+ XCTFail("Couldn't find model or tokenizer files")
+ return
+ }
+ let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+ var text = ""
+
+ do {
+ try runner.generate([
+ MultimodalInput(systemPrompt),
+ MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
+ ], Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+        if token == "<end_of_turn>" {
+ runner.stop()
+ }
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("waterfall"))
+
+ text = ""
+ runner.reset()
+ do {
+ try runner.generate([
+ MultimodalInput(systemPrompt),
+ MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
+ ], Config {
+ $0.sequenceLength = sequenceLength
+ }) { token in
+ text += token
+        if token == "<end_of_turn>" {
+ runner.stop()
+ }
+ }
+ } catch {
+ XCTFail("Failed to generate text with error \(error)")
+ }
+ XCTAssertTrue(text.lowercased().contains("waterfall"))
+ }
func testLLaVA() {
+ let chatTemplate = "USER: %@ ASSISTANT: "
+ let sideSize: CGFloat = 336
+ let sequenceLength = 768
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+ let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
let uiImage = UIImage(contentsOfFile: imagePath) else {
XCTFail("Couldn't find model or tokenizer files")
@@ -75,8 +172,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
- MultimodalInput(uiImage.asImage()),
- MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+ MultimodalInput(uiImage.asImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
@@ -92,8 +189,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
- MultimodalInput(uiImage.asImage()),
- MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+ MultimodalInput(uiImage.asImage(sideSize)),
+ MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index f7124fec640..0fa2b59d05d 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -42,7 +42,7 @@ class TextRunnerTest: XCTestCase {
func testLLaMA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
+ let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
XCTFail("Couldn't find model or tokenizer files")
return
}
@@ -77,7 +77,7 @@ class TextRunnerTest: XCTestCase {
func testPhi4() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
- let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+ let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
XCTFail("Couldn't find model or tokenizer files")
return
}
From c609f635ad6fb7939e7f56ed955a59ae4221a5fb Mon Sep 17 00:00:00 2001
From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com>
Date: Mon, 6 Oct 2025 17:36:45 -0700
Subject: [PATCH 146/266] Fixed assumption on out_shift for quantized linear
Differential Revision: D83875670
Pull Request resolved: https://github.com/pytorch/executorch/pull/14789
---
backends/cadence/aot/ref_implementations.py | 4 ++--
.../aot/tests/test_ref_implementations.py | 16 ++++++++--------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 2642340679e..ad1abb3ce4b 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -330,8 +330,8 @@ def variant(
if out_shift.numel() != 1:
raise ValueError("out_shift must be a scalar")
- if out_shift.dtype != torch.int64:
- raise ValueError("out_shift must be an int64")
+ if out_shift.dtype != torch.int32:
+ raise ValueError("out_shift must be an int32")
_out_shift = int(out_shift.item())
_out_multiplier = int(out_multiplier[0].item())
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index f78d2292e7b..d8a79454097 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -172,7 +172,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[0]], dtype=dtype), # expected_output
per_tensor,
@@ -197,7 +197,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[-2, -8]], dtype=dtype), # expected_output
per_tensor,
@@ -220,7 +220,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor([[0, 0]], dtype=dtype), # expected_output
per_tensor,
@@ -244,7 +244,7 @@ def test_quantized_add(
torch.tensor(
[1073741824], dtype=torch.int32
), # out_multiplier (0.5 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
0, # out_zero_point
torch.tensor(
[[[0, -2, -4], [-2, -7, -12]]], dtype=dtype
@@ -270,7 +270,7 @@ def test_quantized_add(
torch.tensor(
[268435456], dtype=torch.int32
), # out_multiplier (1.0 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
1, # out_zero_point
torch.tensor([[1, 1]], dtype=dtype), # expected_output
per_tensor,
@@ -295,7 +295,7 @@ def test_quantized_add(
torch.tensor(
[268435456], dtype=torch.int32
), # out_multiplier (1.0 * 2^31)
- torch.tensor([0], dtype=torch.int64), # out_shift
+ torch.tensor([0], dtype=torch.int32), # out_shift
1, # out_zero_point
torch.tensor([[1, 1]], dtype=dtype), # expected_output
False,
@@ -317,7 +317,7 @@ def test_quantized_add(
[268435456], dtype=torch.int32
), # out_multiplier (0.125 * 2^31)
torch.tensor(
- [1], dtype=torch.int64
+ [1], dtype=torch.int32
), # out_shift (shift=1, doubles the scale)
1, # out_zero_point
torch.tensor([[1, 2]], dtype=dtype), # expected_output
@@ -339,7 +339,7 @@ def test_quantized_add(
[268435456], dtype=torch.int32
), # out_multiplier (0.125 * 2^31)
torch.tensor(
- [1], dtype=torch.int64
+ [1], dtype=torch.int32
), # out_shift (shift=1, doubles the scale)
1, # out_zero_point
torch.tensor([[1, 2]], dtype=dtype), # expected_output
From d36bf8ce6ea37d867384f58829418d3a365f8c3b Mon Sep 17 00:00:00 2001
From: derekxu
Date: Mon, 6 Oct 2025 21:44:21 -0700
Subject: [PATCH 147/266] Run ET-eager on message recall
Differential Revision: D83990682
Pull Request resolved: https://github.com/pytorch/executorch/pull/14822
---
examples/models/llama/rope.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 0d1dd306091..ea4e6b37243 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -240,7 +240,7 @@ def __init__(self, params: ModelArgs):
self.precompute_freqs_cis = partial(
hf_precompute_freqs_cis,
partial_rotary_factor=self.params.partial_rotary_factor,
- device=self.params.device,
+ device=getattr(self.params, "device", "cpu"),
)
self.apply_rotary_emb = hf_apply_rotary_emb
else:
@@ -249,7 +249,7 @@ def __init__(self, params: ModelArgs):
use_scaled=self.params.use_scaled_rope,
scale_factor=self.params.rope_scale_factor,
high_freq_factor=self.params.high_freq_factor,
- device=self.params.device,
+ device=getattr(self.params, "device", "cpu"),
)
self.apply_rotary_emb = RotaryEmbedding()
From 0b748bfea8278cfdf60233be475e852d5eaf57f2 Mon Sep 17 00:00:00 2001
From: billmguo
Date: Mon, 6 Oct 2025 21:47:12 -0700
Subject: [PATCH 148/266] oss et update to support SAR2230P
Differential Revision: D83934187
Pull Request resolved: https://github.com/pytorch/executorch/pull/14808
---
backends/qualcomm/serialization/qc_schema.py | 3 +++
backends/qualcomm/utils/utils.py | 2 ++
2 files changed, 5 insertions(+)
diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py
index f3b9e2cc1a5..6f0bceec4c9 100644
--- a/backends/qualcomm/serialization/qc_schema.py
+++ b/backends/qualcomm/serialization/qc_schema.py
@@ -27,6 +27,7 @@ class HtpArch(IntEnum):
V73 = 73
V75 = 75
V79 = 79
+ V81 = 81
@dataclass
@@ -49,6 +50,7 @@ class QcomChipset(IntEnum):
SXR1230P = 45 # v73
SXR2230P = 53 # v69
SXR2330P = 75 # v79
+ SAR2230P = 95 # v81
@dataclass
@@ -69,6 +71,7 @@ class SocInfo:
QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)),
QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)),
QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)),
+ QcomChipset.SAR2230P: SocInfo(QcomChipset.SAR2230P, HtpInfo(HtpArch.V81, 4)),
}
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index be4e86de50f..c57bec43dcf 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -1099,6 +1099,7 @@ def get_soc_to_arch_map():
"SXR1230P": HtpArch.V73,
"SXR2230P": HtpArch.V69,
"SXR2330P": HtpArch.V79,
+ "SAR2230P": HtpArch.V81,
}
@@ -1115,6 +1116,7 @@ def get_soc_to_chipset_map():
"SXR1230P": QcomChipset.SXR1230P,
"SXR2230P": QcomChipset.SXR2230P,
"SXR2330P": QcomChipset.SXR2330P,
+ "SAR2230P": QcomChipset.SAR2230P,
}
From 2c603e43dc2f2db2e1e48512431f21b5910a0a73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?=
<33344797+martinlsm@users.noreply.github.com>
Date: Tue, 7 Oct 2025 14:37:05 +0200
Subject: [PATCH 149/266] Arm backend: Move rescale ops out of node visitors
(#14584)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some TOSA ops do not support INT8 as inputs and outputs. Instead, only
INT32 is supported as a whole number type. Prior to this patch, affected
node visitors inserted rescale ops between the data types INT8 and INT32
before and after the operator such that it will accept its input and
output.
Change this by moving the insertion of the rescale ops to a new pass
called `InsertRescaleInt32Pass`. This will further enable optimizations
to the graph by fusing the rescale nodes.
Only comparison, ABS, MAXIMUM and MINIMUM operators are handled in this
patch; the remaining ones are left out to be done in another patch.
### Test plan
This is refactoring which means that external behavior is not altered. A
new pass `InsertRescaleInt32Pass` has been added and it comes with a new
unit test in backends/arm/test/passes/test_insert_rescale_i32_pass.py.
Signed-off-by: Martin Lindström
Co-authored-by: Oscar Andersson
---
backends/arm/_passes/__init__.py | 2 +-
backends/arm/_passes/arm_pass_manager.py | 2 +
backends/arm/_passes/insert_rescales_pass.py | 240 +++++++++++++++++-
backends/arm/operators/op_abs.py | 90 +------
backends/arm/operators/op_eq.py | 15 +-
backends/arm/operators/op_ge.py | 15 +-
backends/arm/operators/op_gt.py | 15 +-
backends/arm/operators/op_le.py | 15 +-
backends/arm/operators/op_lt.py | 15 +-
backends/arm/operators/op_maximum.py | 48 +---
backends/arm/operators/op_minimum.py | 45 +---
.../passes/test_insert_rescale_i32_pass.py | 77 ++++++
12 files changed, 341 insertions(+), 238 deletions(-)
create mode 100644 backends/arm/test/passes/test_insert_rescale_i32_pass.py
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index 93bf20e69c1..008bc305aad 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -81,7 +81,7 @@
from .insert_int32_casts_after_int64_placeholders import ( # noqa
InsertInt32CastsAfterInt64PlaceholdersPass,
)
-from .insert_rescales_pass import InsertRescalePass # noqa
+from .insert_rescales_pass import InsertRescaleInt32Pass, InsertRescalePass # noqa
from .insert_table_ops import InsertTableOpsPass # noqa
from .match_arg_dtype_pass import MatchArgDtypePass # noqa
from .match_arg_ranks_pass import MatchArgRanksPass # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index b7c511bbe0b..1a0f4e4d384 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -81,6 +81,7 @@
FuseEqualPlaceholdersPass,
FuseQuantizedActivationPass,
InsertInt32CastsAfterInt64PlaceholdersPass,
+ InsertRescaleInt32Pass,
InsertRescalePass,
InsertTableOpsPass,
MatchArgDtypePass,
@@ -214,6 +215,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
self.add_pass(ToTosaMemoryFormatPass(exported_program))
self.add_pass(RemoveNoopPass())
self.add_pass(InsertRescalePass())
+ self.add_pass(InsertRescaleInt32Pass())
self.validate_constraints_mandatory()
return self._transform(exported_program.graph_module)
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
index 100ac03c2b0..d56e70e78b3 100644
--- a/backends/arm/_passes/insert_rescales_pass.py
+++ b/backends/arm/_passes/insert_rescales_pass.py
@@ -4,9 +4,14 @@
# LICENSE file in the root directory of this source tree.
from copy import copy
-from typing import cast, Set, Type
+from typing import cast, Dict, Optional, Set, Tuple, Type
-from executorch.backends.arm._passes.arm_pass_utils import create_node
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import create_node, set_node_arg
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+ get_output_qparams,
+)
from executorch.backends.arm._passes.quant_args import QuantArgs
from executorch.backends.arm.constants import DQ_OPS, Q_OPS
from executorch.exir.dialects._ops import ops as exir_ops
@@ -65,3 +70,234 @@ def call(self, graph_module: GraphModule) -> PassResult:
graph_module = super().call(graph_module).graph_module
graph_module.recompile()
return PassResult(graph_module, modified)
+
+
+class InsertRescaleInt32Pass(ArmPass):
+ """
+ Numerous TOSA ops require inputs and outputs to be 32-bit integers in their
+ quantized implementations. This pass treats such operator nodes by
+ inserting rescale ops before and after them if needed. Note that extra logic
+ that handles the scales and zero points must be in place because the affected
+ TOSA have naive implementations that do not account for the quantization
+ parameters.
+ """
+
+ _passes_required_after: Set[Type[ExportPass]] = set()
+
+ included_targets = [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ exir_ops.edge.aten.maximum.default,
+ exir_ops.edge.aten.minimum.default,
+ ]
+
+ def _int32_qargs(self, s):
+ """Helper creator function for INT32-based QuantArgs"""
+
+ return QuantArgs(
+ scale=s,
+ zp=0,
+ qmin=torch.iinfo(torch.int32).min,
+ qmax=torch.iinfo(torch.int32).max,
+ dtype=torch.int32,
+ )
+
+ def _get_inputs_rescaled_qparams(
+ self, target, input_qparams: Dict[int, QuantArgs]
+ ) -> Dict[int, QuantArgs]:
+ """Get the qparams for the INT32 operands to the op ``target``
+
+ Inputs to the INT32-based operator must be rescaled from INT8 to INT32.
+ This function computes the ``QuantArgs`` for each of the operands and returns
+ it as a dict, mapping tensor index to ``QuantArgs``.
+ """
+
+ if target in [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ exir_ops.edge.aten.minimum.default,
+ exir_ops.edge.aten.maximum.default,
+ ]:
+ # For these ops, use the smallest scale among the INT8 operands.
+ min_scale = min(
+ [qp.get_scale_per_tensor() for qp in input_qparams.values()]
+ )
+ qparams = {
+ i: self._int32_qargs(min_scale) for i in range(len(input_qparams))
+ }
+ else:
+ raise ValueError(f"Not a valid target: {target}")
+
+ return qparams
+
+ def _get_output_qparams(
+ self, target, inputs_qparams: Dict[int, QuantArgs]
+ ) -> Optional[QuantArgs]:
+ """Given an op ``target`` and the ``QuantArgs`` for each of its inputs, compute
+ the scale of the output based on how the operator itself affects it."""
+
+ if target in [
+ exir_ops.edge.aten.abs.default,
+ exir_ops.edge.aten.maximum.default,
+ exir_ops.edge.aten.minimum.default,
+ ]:
+ # The op has not altered the scale; the output scale is equal to
+ # the operands' scales.
+ return self._int32_qargs(inputs_qparams[0].get_scale_per_tensor())
+ elif target in [
+ exir_ops.edge.aten.eq.Tensor,
+ exir_ops.edge.aten.ge.Tensor,
+ exir_ops.edge.aten.gt.Tensor,
+ exir_ops.edge.aten.le.Tensor,
+ exir_ops.edge.aten.lt.Tensor,
+ ]:
+ # Output is bool for these ops and thus no qparams are present
+ return None
+ else:
+ raise ValueError(f"Not a valid target: {target}")
+
+ def _get_rescale_qparams(
+ self, target, input_qparams: Dict[int, QuantArgs]
+ ) -> Tuple[Dict[int, QuantArgs], Optional[QuantArgs]]:
+ """
+ Get the quantization parameters of the INT32 inputs/outputs that will
+ surround the node after the new RESCALE ops have been inserted.
+ """
+
+ inputs_rescaled_qparams = self._get_inputs_rescaled_qparams(
+ target, input_qparams
+ )
+ output_qparams = self._get_output_qparams(target, inputs_rescaled_qparams)
+
+ return (inputs_rescaled_qparams, output_qparams)
+
+ def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> bool:
+ qargs = node.meta["input_qparams"]
+
+ args_copy = list(node.args)
+ seen_args = set()
+ modified = False
+ for i in qargs:
+ qp = qargs[i]
+ if qp.dtype != torch.int8:
+ continue
+
+ arg_node = args_copy[i]
+ if arg_node in seen_args:
+ continue
+ seen_args.add(arg_node)
+
+ with graph.inserting_after(arg_node):
+ rescale_node = create_node(
+ graph,
+ exir_ops.backend.tosa.RESCALE.default,
+ (
+ arg_node,
+ torch.int32,
+ qp.get_scale_per_tensor()
+ / rescale_qargs[
+ i
+ ].get_scale_per_tensor(), # Old scale / new scale
+ qp.get_zp_per_tensor(), # Old zero point
+ rescale_qargs[i].get_zp_per_tensor(), # New zero point
+ ),
+ from_node=node,
+ )
+
+ node.replace_input_with(arg_node, rescale_node)
+ modified = True
+
+ return modified
+
+ def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> bool:
+ if "output_qparams" not in node.meta or len(node.meta["output_qparams"]) == 0:
+ return False
+
+ qargs = get_output_qparams(node)
+ assert len(qargs) == 1
+ assert rescale_qargs is not None
+
+ qarg = qargs[0]
+ if qarg.dtype != torch.int8:
+ return False
+
+ users_copy = list(node.users)
+
+ with graph.inserting_after(node):
+ rescale_node = create_node(
+ graph,
+ exir_ops.backend.tosa.RESCALE.default,
+ (
+ node,
+ torch.int8,
+ rescale_qargs.get_scale_per_tensor()
+ / qarg.get_scale_per_tensor(), # Old scale / new scale
+ rescale_qargs.get_zp_per_tensor(), # Old zero point
+ qarg.get_zp_per_tensor(), # New zero point
+ ),
+ from_node=node,
+ )
+
+ for user in users_copy:
+ user.replace_input_with(node, rescale_node)
+
+ return True
+
+ def call(self, graph_module: GraphModule) -> PassResult:
+ graph = graph_module.graph
+
+ modified = False
+ for node in list(graph.nodes):
+ node = cast(Node, node)
+
+ if node.op != "call_function" or node.target not in self.included_targets:
+ continue
+
+ if "input_qparams" not in node.meta or len(node.meta["input_qparams"]) == 0:
+ continue
+ input_qparams = node.meta["input_qparams"]
+
+ inputs_rescale_qargs, output_rescale_qargs = self._get_rescale_qparams(
+ node.target, input_qparams
+ )
+
+ inputs_was_rescaled = self._rescale_inputs(
+ graph, node, inputs_rescale_qargs
+ )
+ outputs_was_rescaled = False
+ if inputs_was_rescaled:
+ outputs_was_rescaled = self._rescale_outputs(
+ graph, node, output_rescale_qargs
+ )
+ modified = True
+
+ # Update node metadata
+
+ if inputs_was_rescaled:
+ assert len(inputs_rescale_qargs) == len(node.meta["input_qparams"])
+ node.meta["input_qparams"] = inputs_rescale_qargs
+
+ if outputs_was_rescaled:
+ assert len(node.meta["output_qparams"]) == 1
+ node.meta["output_qparams"] = {0: output_rescale_qargs}
+
+ # If the output type is specified in the node, change it such
+ # that it matches the subsequent rescale node(s) that this node
+ # now has output edges to.
+ if "dtype" in node.kwargs:
+ set_node_arg(node, "dtype", torch.int32)
+
+ if modified:
+ # Retrace the graph to update the fake tensor types
+ graph_module = super().call(graph_module).graph_module
+ graph_module.recompile()
+
+ return PassResult(graph_module, modified)
diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py
index ec76eb5517f..943c4778867 100644
--- a/backends/arm/operators/op_abs.py
+++ b/backends/arm/operators/op_abs.py
@@ -6,9 +6,6 @@
# pyre-unsafe
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-import executorch.backends.arm.tosa.utils as tutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -18,22 +15,20 @@
validate_same_dtype,
validate_valid_dtype,
)
-from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
+from executorch.backends.arm.tosa.specification import TosaSpecification
from torch.fx import Node
@register_node_visitor
-class AbsVisitor_INT(NodeVisitor):
+class AbsVisitor(NodeVisitor):
target = "aten.abs.default"
tosa_specs = [
TosaSpecification.create_from_string("TOSA-1.0+INT"),
+ TosaSpecification.create_from_string("TOSA-1.0+FP"),
]
- def __init__(self, *args):
- super().__init__(*args)
-
def define_node(
self,
node: Node,
@@ -47,89 +42,18 @@ def define_node(
validate_num_inputs(self.target, inputs, 1)
validate_same_dtype(self.target, [*inputs, output], ts)
- # Handle int8 (quantized) and int32
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- if inputs[0].dtype == ts.DType.INT8:
- rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- ) # type: ignore[possibly-undefined]
- else:
- # input[0].dtype == ts.DType.INT32
- # Non quantized input, natively support by TOSA.abs
- rescaled_inputs = inputs
-
- if output.dtype == ts.DType.INT8:
- broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
- abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
- else:
- # output.dtype == ts.DType.INT32
- abs_output = output
-
- # Do the INT32 Abs
- self._serialize_operator(
- node,
- tosa_graph,
+ tosa_graph.addOperator(
ts.TosaOp.Op().ABS,
[
- rescaled_inputs[0].name,
+ inputs[0].name,
],
- [abs_output.name],
+ [output.name],
None,
)
-
- if output.dtype == ts.DType.INT8:
- # Scale output back to 8 bit
- # pyre-ignore
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, abs_output, scale_back, node, self.tosa_spec
- ) # type: ignore[possibly-undefined]
-
-
-@register_node_visitor
-class AbsVisitor_FP(AbsVisitor_INT):
- # inheriting 'target' from BI class
-
- tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
-
- def __init__(self, *args):
- super().__init__(*args)
-
- def define_node(
- self,
- node: Node,
- tosa_graph: Any,
- inputs: List[TosaArg],
- output: TosaArg,
- ) -> None:
-
- import serializer.tosa_serializer as ts # type: ignore
-
- validate_num_inputs(self.target, inputs, 1)
- validate_same_dtype(self.target, [*inputs, output], ts)
-
- if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
- # Call the inherited define_node for handling integers
- super().define_node(node, tosa_graph, inputs, output)
- else:
- # FP32 Abs lowering
-
- validate_valid_dtype(
- self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec
- )
-
- # MI lowering
- self._serialize_operator(
- node,
- tosa_graph,
- ts.TosaOp.Op().ABS,
- [inputs[0].name],
- [output.name],
- None,
- )
diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py
index 2136fe2e946..76b6e67cd8d 100644
--- a/backends/arm/operators/op_eq.py
+++ b/backends/arm/operators/op_eq.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,23 +54,12 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
# Do the equal comparison
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().EQUAL,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py
index c538e735880..4bb20cac77f 100644
--- a/backends/arm/operators/op_ge.py
+++ b/backends/arm/operators/op_ge.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER_EQUAL,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py
index d407e28c1b6..c25c959681e 100644
--- a/backends/arm/operators/op_gt.py
+++ b/backends/arm/operators/op_gt.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER,
- [input_nodes[0].name, input_nodes[1].name],
+ [inputs[0].name, inputs[1].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py
index 403c6c233d3..e62d669814f 100644
--- a/backends/arm/operators/op_le.py
+++ b/backends/arm/operators/op_le.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER_EQUAL,
- [input_nodes[1].name, input_nodes[0].name],
+ [inputs[1].name, inputs[0].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py
index f5132dd4feb..cccb0abd5d7 100644
--- a/backends/arm/operators/op_lt.py
+++ b/backends/arm/operators/op_lt.py
@@ -7,8 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
)
validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
- input_nodes = inputs
- # Handle quantization
- if inputs[0].dtype == ts.DType.INT8:
- # Rescale inputs to 32 bit
- rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- # Update IO
- input_nodes = rescaled_inputs
-
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().GREATER,
- [input_nodes[1].name, input_nodes[0].name],
+ [inputs[1].name, inputs[0].name],
[output.name],
None,
)
diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py
index 66437f8af1d..50c6e06a4bb 100644
--- a/backends/arm/operators/op_maximum.py
+++ b/backends/arm/operators/op_maximum.py
@@ -7,12 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
- get_input_qparams,
-)
-
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -22,9 +16,8 @@
validate_same_dtype,
validate_valid_dtype,
)
-from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
+from executorch.backends.arm.tosa.specification import TosaSpecification
from torch.fx import Node
@@ -56,35 +49,12 @@ def define_node(
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- max_output = output
- if inputs[0].dtype == ts.DType.INT8:
- input_qparams = get_input_qparams(node)
- if len(input_qparams) != 2:
- raise ValueError(
- f"Both inputs need to have quantization information for {node}"
- )
- if input_qparams[0] != input_qparams[1]:
- raise ValueError(
- "Both inputs must have the same quantization parameters for MAX"
- )
-
- operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- output.shape = tosa_shape(output.shape, output.dim_order)
- max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
- else:
- operand_inputs = inputs
-
attr_maximum = ts.TosaSerializerAttribute()
-
- # Set to PROPOGATE as default
+ # Set to PROPAGATE as default
attr_maximum.MaximumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
self._serialize_operator(
@@ -92,15 +62,9 @@ def define_node(
tosa_graph,
ts.TosaOp.Op().MAXIMUM,
[
- operand_inputs[0].name,
- operand_inputs[1].name,
+ inputs[0].name,
+ inputs[1].name,
],
- [max_output.name],
+ [output.name],
attr_maximum,
)
-
- if output.dtype == ts.DType.INT8:
- # insert RESCALE from int32 back to int8
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, max_output, scale_back, node, self.tosa_spec
- )
diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py
index 518366d5463..d5b97f186d3 100644
--- a/backends/arm/operators/op_minimum.py
+++ b/backends/arm/operators/op_minimum.py
@@ -7,11 +7,6 @@
from typing import Any, List
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
- get_input_qparams,
-)
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -23,7 +18,6 @@
)
from executorch.backends.arm.tosa import TosaSpecification
from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
from torch.fx import Node
@@ -55,35 +49,12 @@ def define_node(
validate_valid_dtype(
self.target,
[*inputs, output],
- [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+ [ts.DType.INT32, ts.DType.FP32],
output.tosa_spec,
)
- scale_back = 1.0
- min_output = output
- if inputs[0].dtype == ts.DType.INT8:
- input_qparams = get_input_qparams(node)
- if len(input_qparams) != 2:
- raise ValueError(
- f"Both inputs need to have quantization information for {node}"
- )
- if input_qparams[0] != input_qparams[1]:
- raise ValueError(
- "Both inputs must have the same quantization parameters for MIN"
- )
-
- operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
- tosa_graph, inputs, node, self.tosa_spec
- )
-
- output.shape = tosa_shape(output.shape, output.dim_order)
- min_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
- else:
- operand_inputs = inputs
-
attr_minimum = ts.TosaSerializerAttribute()
-
- # Set to PROPOGATE as default
+ # Set to PROPAGATE as default
attr_minimum.MinimumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
self._serialize_operator(
@@ -91,15 +62,9 @@ def define_node(
tosa_graph,
ts.TosaOp.Op().MINIMUM,
[
- operand_inputs[0].name,
- operand_inputs[1].name,
+ inputs[0].name,
+ inputs[1].name,
],
- [min_output.name],
+ [output.name],
attr_minimum,
)
-
- if output.dtype == ts.DType.INT8:
- # insert RESCALE from int32 back to int8
- tqutils.insert_rescale_op_to_int8(
- tosa_graph, min_output, scale_back, node, self.tosa_spec
- )
diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
new file mode 100644
index 00000000000..096c90d330d
--- /dev/null
+++ b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
@@ -0,0 +1,77 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm._passes import (
+ FoldAndAnnotateQParamsPass,
+ InsertRescaleInt32Pass,
+)
+from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+
+
+class NeedsRescaleOps(torch.nn.Module):
+ """A module containing ops that require INT32 inputs/outputs."""
+
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x, y):
+ a = torch.maximum(x, y)
+ b = torch.abs(a)
+ c = a > b
+ return c
+
+ def get_inputs(self, dtype) -> input_t:
+ if dtype == torch.float32:
+ return (torch.rand(1, 3, 5, 6), torch.rand(1, 3, 5, 6))
+ elif dtype == torch.int32:
+ return (
+ torch.randint(3, 5, (3,), dtype=torch.int32),
+ torch.randint(3, 5, (3,), dtype=torch.int32),
+ )
+ else:
+ raise ValueError("Not a valid input dtype for model")
+
+
+def test_insert_rescales():
+ module = NeedsRescaleOps()
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+ ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ ops_after = {
+ # "number of op nodes with i8 output" + "number of i8 node inputs"
+ "executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2
+ + 5,
+ }
+ pipeline = PassPipeline[input_t](
+ module,
+ module.get_inputs(torch.float32),
+ quantize=True,
+ ops_not_before_pass=ops_not_before,
+ ops_after_pass=ops_after,
+ pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+ )
+ pipeline.pop_stage("run_method_and_compare_outputs")
+ pipeline.run()
+
+
+def test_dont_insert_rescales():
+ module = NeedsRescaleOps()
+ input_t = Tuple[torch.Tensor, torch.Tensor]
+ ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ # All inputs are already i32. Rescales should not be added.
+ ops_not_after = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+ pipeline = PassPipeline[input_t](
+ module,
+ module.get_inputs(torch.int32),
+ ops_not_before_pass=ops_not_before,
+ ops_not_after_pass=ops_not_after,
+ pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+ )
+ pipeline.pop_stage("run_method_and_compare_outputs")
+ pipeline.run()
From 1b8d380bf1db79ce22fba5096aefb80c2224e5a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0imon=20Str=C3=BD=C4=8Dek?=
Date: Tue, 7 Oct 2025 15:30:38 +0200
Subject: [PATCH 150/266] NXP backend: Add NXP backend tutorial page (#14850)
### Summary
Adds tutorial page for NXP backend.
### Test plan
Documentation built locally using Makefile without any problems.
cc @robert-kalmar @JakeStevens @digantdesai
---
docs/source/backends-nxp.md | 41 ++++++++++++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 3 deletions(-)
diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md
index f02f495f685..4783b4a5bc6 100644
--- a/docs/source/backends-nxp.md
+++ b/docs/source/backends-nxp.md
@@ -1,5 +1,40 @@
# NXP eIQ Neutron Backend
-See
-[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md)
-for current status about running ExecuTorch on NXP eIQ Neutron Backend.
+This manual page introduces how to use ExecuTorch with the NXP eIQ Neutron Backend.
+NXP offers accelerated machine learning models inference on edge devices.
+To learn more about NXP's machine learning acceleration platform, please refer to [the official NXP website](https://www.nxp.com/applications/technologies/ai-and-machine-learning:MACHINE-LEARNING).
+
+
+For up-to-date status about running ExecuTorch on the Neutron Backend, please visit the
+[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md) manual page.
+
+
+## Features
+
+Executorch v1.0 supports running machine learning models on selected NXP chips (for now only i.MXRT700).
+Among currently supported machine learning models are:
+- Convolution-based neural networks
+- Full support for MobileNetv2 and CifarNet
+
+## Prerequisites (Hardware and Software)
+
+In order to successfully build the ExecuTorch project and convert models for the NXP eIQ Neutron Backend, you will need a computer running Windows or Linux.
+
+If you want to test the runtime, you'll also need:
+- Hardware with NXP's [i.MXRT700](https://www.nxp.com/products/i.MX-RT700) chip or a testing board like MIMXRT700-AVK
+- [MCUXpresso IDE](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-integrated-development-environment-ide:MCUXpresso-IDE) or [MCUXpresso Visual Studio Code extension](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-for-visual-studio-code:MCUXPRESSO-VSC)
+
+## Using NXP backend
+
+To test converting a neural network model for inference on NXP eIQ Neutron Backend, you can use our example script:
+
+```shell
+# cd to the root of executorch repository
+./examples/nxp/aot_neutron_compile.sh [model (cifar10 or mobilenetv2)]
+```
+
+For a quick overview of how to convert a custom PyTorch model, take a look at our [example Python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py).
+
+## Runtime Integration
+
+To learn how to run the converted model on NXP hardware, use one of the ExecuTorch runtime example projects from the MCUXpresso IDE example projects list.
+For a more fine-grained tutorial, visit [this manual page](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html).
From d8e07bd20c848f8b85d78444d8b9b5dcf8df2924 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Tue, 7 Oct 2025 07:55:15 -0700
Subject: [PATCH 151/266] Add .ptd support to portable executor runner (#14833)
This pull request enhances the `executor_runner` example by adding
support for loading and using `.ptd` (portable tensor data) files. This
enables the runner to ingest pre-serialized tensor data, improving
flexibility for model input handling. The changes include updates to
both build configuration and the main runner logic.
**Support for .ptd file loading and usage:**
* Added a new command-line flag `data_path` to specify the path to a
`.ptd` data file in `executor_runner.cpp` and integrated logic to load
this file and parse its contents using `FlatTensorDataMap`.
[[1]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R54)
[[2]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R177-R204)
* Updated the runner to pass the loaded tensor data map to the model
method loader, allowing methods to access pre-loaded input data.
**Build and dependency updates:**
* Included `flat_tensor_data_map` as a dependency in both the Bazel
build targets and CMake build configuration to ensure the new
functionality is available during compilation.
[[1]](diffhunk://#diff-d613fef537c6c97cf343cfcde252e980f7673c21aad54b40a2315aa44c284a8cR22)
[[2]](diffhunk://#diff-d613fef537c6c97cf343cfcde252e980f7673c21aad54b40a2315aa44c284a8cR42)
[[3]](diffhunk://#diff-1e7de1ae2d059d21e1dd75d5812d5a34b0222cef273b7c3a2af62eb747f9d20aR1024-R1026)
* Added the necessary header include for `flat_tensor_data_map` in
`executor_runner.cpp` and updated the relevant namespace usage.
[[1]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R29)
[[2]](diffhunk://#diff-179a73518cca7aa859d17ae188553f0eb0bee3ba5d2a99d8c636fae0bb39f759R77)
## Test Plan:
Tested with .pte and .ptd for CUDA backend:
```
python -m executorch.examples.cuda.scripts.export --model_name linear --output_dir ./
```
Make sure we have `linear.pte` and `aoti_cuda_blob.ptd`.
Build executor runner with the following options:
```
cmake -DCMAKE_BUILD_TYPE=Debug -DEXECUTORCH_BUILD_CUDA=ON -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON -S. -Bcmake-out
```
Then do:
```
cmake --build cmake-out -j8
```
Then we can run:
```
cmake-out/executor_runner --model_path linear.pte --ptd_path aoti_cuda_blob.ptd
I 00:00:00.000594 executorch:executor_runner.cpp:189] PTD file aoti_cuda_blob.ptd is loaded.
I 00:00:00.000671 executorch:executor_runner.cpp:199] PTD data map created with 1 keys.
I 00:00:00.000749 executorch:executor_runner.cpp:249] Model file linear.pte is loaded.
I 00:00:00.000758 executorch:executor_runner.cpp:258] Using method forward
I 00:00:00.000770 executorch:executor_runner.cpp:309] Setting up planned buffer 0, size 96.
I 00:00:00.002908 executorch:cuda_backend.cpp:140] Writing 394624 bytes to /tmp/linear_so_blob844427.so
I 00:00:00.324783 executorch:cuda_backend.cpp:174] container_handle = 0x26a71b0
I 00:00:00.324867 executorch:executor_runner.cpp:337] Method loaded.
I 00:00:00.325796 executorch:cuda_backend.cpp:249] Inputs copied to GPU
I 00:00:00.325829 executorch:cuda_backend.cpp:278] Outputs created on GPU
E 00:00:00.326623 executorch:memory.cpp:286] Cannot delete null tensor
I 00:00:00.326678 executorch:executor_runner.cpp:374] Model executed successfully 1 time(s) in 1.777041 ms.
I 00:00:00.326691 executorch:executor_runner.cpp:383] 1 outputs:
OutputX 0: tensor(sizes=[3, 3], [-0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356, -0.199237, 0.550725, 0.0830356])
E 00:00:00.328474 executorch:memory.cpp:299] Didn't find tensor 0x699a3d0
```
---
.ci/scripts/test_model.sh | 11 +++---
.ci/scripts/utils.sh | 7 ++--
CMakeLists.txt | 4 +++
examples/portable/custom_ops/CMakeLists.txt | 10 ++++--
.../executor_runner/executor_runner.cpp | 36 ++++++++++++++++++-
examples/portable/executor_runner/targets.bzl | 2 ++
.../selective_build/advanced/CMakeLists.txt | 9 +++--
examples/selective_build/basic/CMakeLists.txt | 9 +++--
.../flat_tensor/flat_tensor_data_map.cpp | 2 +-
.../serialize/flat_tensor_header.cpp | 2 ++
tools/cmake/preset/arm_baremetal.cmake | 3 +-
tools/cmake/preset/default.cmake | 4 +--
12 files changed, 81 insertions(+), 18 deletions(-)
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index de28597b1d5..8449809ffe3 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -48,22 +48,25 @@ prepare_artifacts_upload() {
fi
}
+
build_cmake_executor_runner() {
local backend_string_select="${1:-}"
echo "Building executor_runner"
rm -rf ${CMAKE_OUTPUT_DIR}
mkdir ${CMAKE_OUTPUT_DIR}
+ # Common options:
+ COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE"
if [[ "$backend_string_select" == "XNNPACK" ]]; then
echo "Backend $backend_string_select selected"
- (cd ${CMAKE_OUTPUT_DIR} \
- && cmake -DCMAKE_BUILD_TYPE=Release \
+ cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_XNNPACK=ON \
- -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+ ${COMMON} \
+ -B${CMAKE_OUTPUT_DIR} .
cmake --build ${CMAKE_OUTPUT_DIR} -j4
else
cmake -DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
- -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+ ${COMMON} \
-B${CMAKE_OUTPUT_DIR} .
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
fi
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index f6f6ece786b..f896d3f1d40 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -125,14 +125,15 @@ build_executorch_runner_cmake() {
clean_executorch_install_folders
mkdir "${CMAKE_OUTPUT_DIR}"
- pushd "${CMAKE_OUTPUT_DIR}" || return
if [[ $1 == "Debug" ]]; then
CXXFLAGS="-fsanitize=address,undefined"
else
CXXFLAGS=""
fi
- CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" ..
- popd || return
+ CXXFLAGS="$CXXFLAGS" retry cmake \
+ -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+ -DCMAKE_BUILD_TYPE="${1:-Release}" \
+ -B${CMAKE_OUTPUT_DIR} .
if [ "$(uname)" == "Darwin" ]; then
CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7012ec641bf..6a36d7e563a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1021,6 +1021,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
extension_runner_util gflags executorch_backends
)
+ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+ list(APPEND _executor_runner_libs extension_flat_tensor)
+ endif()
+
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/examples/portable/custom_ops/CMakeLists.txt b/examples/portable/custom_ops/CMakeLists.txt
index 4188554af79..8e679697b47 100644
--- a/examples/portable/custom_ops/CMakeLists.txt
+++ b/examples/portable/custom_ops/CMakeLists.txt
@@ -117,8 +117,14 @@ list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_executable(custom_ops_executor_runner ${_executor_runner__srcs})
target_link_libraries(
- custom_ops_executor_runner custom_ops_lib executorch extension_evalue_util
- extension_runner_util gflags
+ custom_ops_executor_runner
+ custom_ops_lib
+ executorch
+ extension_evalue_util
+ extension_runner_util
+ gflags
+ extension_data_loader
+ extension_flat_tensor
)
target_compile_options(
custom_ops_executor_runner PUBLIC ${_common_compile_options}
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 5ce872eec8e..0974e751203 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
#include
#include
+#include
#include
#include
#include
@@ -50,6 +51,7 @@ DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
+DEFINE_string(data_path, "", "Path to data file.");
DEFINE_string(inputs, "", "Comma-separated list of input files");
DEFINE_string(
output_file,
@@ -72,6 +74,7 @@ DEFINE_int32(
using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::EventTracer;
@@ -171,6 +174,34 @@ int main(int argc, char** argv) {
"FileDataLoader::from() failed: 0x%" PRIx32,
(uint32_t)loader.error());
+ // Load .ptd file if provided
+ std::unique_ptr