From 30a1a258b22d1471c0aae328f30a5910af6af118 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 26 Aug 2025 12:31:49 +0400 Subject: [PATCH 01/27] openvino quantizer refactored --- backends/openvino/quantizer/__init__.py | 4 +- backends/openvino/quantizer/observers.py | 286 ++++++++++++ .../quantizer/observers/nncf_observers.py | 176 -------- backends/openvino/quantizer/quantizer.py | 412 ++++++++++-------- examples/models/llama/export_llama_lib.py | 9 + extension/llm/export/quantizer_lib.py | 38 +- 6 files changed, 573 insertions(+), 352 deletions(-) create mode 100644 backends/openvino/quantizer/observers.py delete mode 100644 backends/openvino/quantizer/observers/nncf_observers.py diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py index df038483f2f..0fd8c10b249 100644 --- a/backends/openvino/quantizer/__init__.py +++ b/backends/openvino/quantizer/__init__.py @@ -1,3 +1,3 @@ -from .quantizer import OpenVINOQuantizer, quantize_model +from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode -__all__ = ["OpenVINOQuantizer", "quantize_model"] +__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"] diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py new file mode 100644 index 00000000000..2ea66f11a55 --- /dev/null +++ b/backends/openvino/quantizer/observers.py @@ -0,0 +1,286 @@ +# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. 
+ +# mypy: disable-error-code=import-not-found + +from abc import ABC, abstractmethod +from typing import Optional, Tuple + +import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped] + +import torch +from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped] + GraphConverter, +) + +from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] + get_tensor_constant_from_node, +) +from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] + constant_update_fn, + module_insertion_transformation_builder, +) +from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] +from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] + WeightCompressionConfig, +) +from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped] + FXWeightCompressionAlgoBackend, +) +from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] + do_integer_quantization, +) +from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] +from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] + PTTargetPoint, + TargetType, +) +from nncf.torch.quantization.layers import ( # type: ignore[import-untyped] + BaseWeightsDecompressor, + INT4AsymmetricWeightsDecompressor, + INT4SymmetricWeightsDecompressor, + INT8AsymmetricWeightsDecompressor, + INT8SymmetricWeightsDecompressor, +) +from torchao.quantization.pt2e import MappingType, ObserverBase +from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes + +class WeightObserverBase(ObserverBase, ABC): + """ + Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation. 
+ """ + + def calculate_qparams( # type: ignore[override] + self, + weight: torch.Tensor, + observer_node: torch.fx.Node, + model: torch.fx.GraphModule, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """ + Calculate quantization parameters such as scale, quantized weight and zero point. + + :param weight: FP weight to be used for calculating qparams. + :return: Quantization params: quantized weight, scale and zero point. + """ + ndims = len(weight.size()) + node_with_weight, weight_port_id = ( + WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model) + ) + _, node_metatype = GraphConverter.get_node_type_and_metatype( + node_with_weight, model + ) + # Special case where embedding metatype has to be mapped to AtenEmbedding metatype + node_metatype = ( + om.PTAtenEmbeddingMetatype + if node_metatype == om.PTEmbeddingMetatype + else node_metatype + ) + reduction_dims = get_weight_compression_reduction_axes( + node_metatype, weight_port_id, ndims + ) + reduction_dims = tuple(reduction_dims) + + q_weight, scale, zp = do_integer_quantization( + Tensor(weight), self.wc_config, reduction_axes=reduction_dims + ) + zp = zp.data if zp is not None else None + return q_weight.data, scale.data, zp + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + @staticmethod + def get_node_with_weight_and_port_ids( + observer_node: torch.fx.Node, model: torch.fx.GraphModule + ) -> Tuple[torch.fx.Node, int]: + """ + Returns the node which contains the weight and the weight port id. + + :param observer_node: Observer node for the weight. + :param model: The FX GraphModule that contains the observer node. + :return: Node which contains the weight (e.g. a Linear node) and the port ID for the weight. 
+ """ + for node in model.graph.nodes: + if observer_node in node.all_input_nodes: + return node, node.all_input_nodes.index(observer_node) + msg = f"Observer node {observer_node.name} has no consumer node" + raise RuntimeError(msg) + + def convert( + self, model: torch.fx.GraphModule, observer_node: torch.fx.Node + ) -> None: + """ + Converts the weight observer node into a decompression subgraph after calibration. + This method is responsible for transforming the model after the quantization preparation + and calibration phases. It replaces the observer node with the quantized weight and a decompression + module. + + :param model: A `torch.fx.GraphModule` representing the statically traced model + with observer nodes attached and calibrated. + :param observer_node: The `torch.fx.Node` corresponding to the observer module for + the weight that is being transformed into a compressed representation. + """ + weight_node = observer_node.args[0] + original_weight = get_tensor_constant_from_node(weight_node, model) + q_weight, scale, zero_point = self.calculate_qparams( + original_weight, observer_node, model + ) + + decompressor = self._create_decompressor( + scale, zero_point, q_weight, original_weight + ) + packed_q_weight = decompressor.pack_weight(q_weight) + + constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + + compressed_weight_name = observer_node.all_input_nodes[0].name + decompressor_suffix = "_".join( + compressed_weight_name.replace(".", "_").split("_")[:-2] + ) + decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" + + module_insertion_transformation_builder( + decompressor, + [ + PTTargetPoint( + TargetType.OPERATOR_POST_HOOK, + target_node_name=compressed_weight_name, + ) + ], + decompressor_name, + )(model) + + decomp_node = observer_node.args[0] + observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] + model.graph.erase_node(observer_node) + + @abstractmethod + 
+ def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + """ + Used to return the respective NNCF decompressor for different types of quantization. + + :param scale: Calculated scale quantization parameter. + :param zero_point: Calculated zero_point quantization parameter. + :param q_weight: Calculated quantized weight. + :param original_weight: FP weight. + :return: NNCF decompressor according to the qmode which creates the decompression subgraph supported by OpenVINO. + """ + pass + + @abstractmethod + def get_wc_config(self) -> WeightCompressionConfig: + """ + Used to return the respective NNCF Weight Compression Config. + + :return: Weight compression config with the compression information such as qmode, group_size etc. + """ + pass + + + class INT4WeightObserver(WeightObserverBase): + """ + This class defines the behavior for INT4 Weight Compression which has per-group granularity. + """ + + def __init__( + self, + group_size: int, + mapping_type: MappingType, + target_dtype: torch.dtype, + *args, + **kwargs, + ) -> None: + """ + :param group_size: Group size for group-wise quantization. group_size=-1 means it is per-channel quantization. + :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization. + :param target_dtype: target dtype for quantization such as int8, uint8, etc. 
+ """ + super().__init__(dtype=target_dtype, is_dynamic=False) + self.wc_config = None + self.mapping_type = mapping_type + + qmode = ( + CompressWeightsMode.INT4_ASYM + if self.mapping_type == MappingType.ASYMMETRIC + else CompressWeightsMode.INT4_SYM + ) + self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size) + + def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + if zero_point is not None: + return INT4AsymmetricWeightsDecompressor( + scale, + zero_point, + q_weight.shape, + original_weight.shape, + original_weight.dtype, + ) + else: + return INT4SymmetricWeightsDecompressor( + scale, q_weight.shape, original_weight.shape, original_weight.dtype + ) + + def get_wc_config(self): + return self.wc_config + + +class INT8WeightObserver(WeightObserverBase): + """ + This class defines the behavior for Int8 WC which has per channel granularity. + """ + + def __init__( + self, + qscheme: torch.qscheme, + dtype: torch.dtype, + ch_axis: int = 0, + *args, + **kwargs, + ) -> None: + """ + :param qscheme: Quantization scheme which is per-channel for Int8 WC. + :param dtype: dtype for quantization such as int8, uint8, etc.. + :param ch_axis: Channel axis. 
+ """ + super().__init__(dtype=dtype, is_dynamic=False) + self.wc_config = None + self.qscheme = qscheme + + qmode = ( + CompressWeightsMode.INT8_SYM + if self.qscheme == torch.per_channel_symmetric + else CompressWeightsMode.INT8_ASYM + ) + self.wc_config = WeightCompressionConfig(mode=qmode) + + def _create_decompressor( + self, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + q_weight: torch.Tensor, + original_weight: torch.Tensor, + ) -> BaseWeightsDecompressor: + if zero_point is not None: + return INT8AsymmetricWeightsDecompressor( + scale, zero_point, original_weight.dtype + ) + else: + return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + + def get_wc_config(self): + return self.wc_config \ No newline at end of file diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py deleted file mode 100644 index f6ac2a3cb91..00000000000 --- a/backends/openvino/quantizer/observers/nncf_observers.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import torch -from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] - get_tensor_constant_from_node, -) -from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] - constant_update_fn, - module_insertion_transformation_builder, -) -from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] -from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] - WeightCompressionConfig, -) - -from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] - do_integer_quantization, -) -from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] -from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] - PTTargetPoint, - TargetType, -) -from nncf.torch.quantization.layers import ( # type: ignore[import-untyped] - INT4AsymmetricWeightsDecompressor, - INT4SymmetricWeightsDecompressor, - INT8AsymmetricWeightsDecompressor, - INT8SymmetricWeightsDecompressor, -) -from torchao.quantization.observer import AffineQuantizedMinMaxObserver -from torchao.quantization.pt2e import ( - get_block_size, - MappingType, - PerAxis, - PerChannelMinMaxObserver, - PerGroup, -) -from torchao.quantization.quant_primitives import _get_reduction_params - - -class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - qmode = ( - CompressWeightsMode.INT4_ASYM - if self.mapping_type == MappingType.ASYMMETRIC - else CompressWeightsMode.INT4_SYM - ) - assert isinstance( - self.granularity, PerGroup - ), "Only PerGroup granularity is supported" - self.wc_config = WeightCompressionConfig( - mode=qmode, group_size=self.granularity.group_size - ) - - def calculate_qparams(self, weight): - assert hasattr(self, "min_val") and hasattr( - self, "max_val" - ), "Expecting the observer has min_val and max_val, please run the observer 
before calling calculate_qparams" - _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) - assert len(reduction_dims) == 1, "Only 1-D group size is supported" - reduction_dims = reduction_dims[0] - 1 - q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims - ) - zp = zp.data if zp is not None else None - return q_weight.data, scale.data, zp - - def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): - print("calling convert") - assert ( - self.original_dtype is not None - ), "Expecting original_dtype to be populated" - weight_node = observer_node.args[0] - original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams(original_weight) - - with model.graph.inserting_before(observer_node): - if zero_point is not None: - decompressor = INT4AsymmetricWeightsDecompressor( - scale, - zero_point, - q_weight.shape, - original_weight.shape, - original_weight.dtype, - ) - else: - decompressor = INT4SymmetricWeightsDecompressor( - scale, q_weight.shape, original_weight.shape, original_weight.dtype - ) - packed_q_weight = decompressor.pack_weight(q_weight) - constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join( - compressed_weight_name.replace(".", "_").split("_")[:-2] - ) - decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - - module_insertion_transformation_builder( - decompressor, - [ - PTTargetPoint( - TargetType.OPERATOR_POST_HOOK, - target_node_name=compressed_weight_name, - ) - ], - decompressor_name, - )(model) - decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) - model.graph.erase_node(observer_node) - - -class NNCFInt8observer(PerChannelMinMaxObserver): - def __init__(self, *args, **kwargs): - 
super().__init__(*args, **kwargs) - qmode = ( - CompressWeightsMode.INT8_SYM - if self.qscheme == torch.per_channel_symmetric - else CompressWeightsMode.INT8_ASYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode) - - def calculate_qparams(self, weight): - assert hasattr(self, "min_val") and hasattr( - self, "max_val" - ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" - self.granularity = PerAxis(axis=self.ch_axis) - self.block_size = get_block_size(weight.shape, self.granularity) - _, reduction_dims = _get_reduction_params(self.block_size, weight.size()) - q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims - ) - zp = zp.data if zp is not None else None - return q_weight.data, scale.data, zp - - def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node): - print("calling convert") - weight_node = observer_node.args[0] - original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams(original_weight) - - with model.graph.inserting_before(observer_node): - if zero_point is not None: - decompressor = INT8AsymmetricWeightsDecompressor( - scale, zero_point, original_weight.dtype - ) - else: - decompressor = INT8SymmetricWeightsDecompressor( - scale, original_weight.dtype - ) - packed_q_weight = decompressor.pack_weight(q_weight) - constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) - compressed_weight_name = observer_node.all_input_nodes[0].name - decompressor_suffix = "_".join( - compressed_weight_name.replace(".", "_").split("_")[:-2] - ) - decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - - module_insertion_transformation_builder( - decompressor, - [ - PTTargetPoint( - TargetType.OPERATOR_POST_HOOK, - target_node_name=compressed_weight_name, - ) - ], - decompressor_name, - )(model) - 
decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) - model.graph.erase_node(observer_node) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index cd78f6907c7..31d41bff7be 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -15,16 +15,11 @@ import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx -from executorch.backends.openvino.quantizer.observers.nncf_observers import ( - NNCFInt8observer, - PTPerBlockParamObserver, +from executorch.backends.openvino.quantizer.observers import ( + INT4WeightObserver, + INT8WeightObserver, ) - from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] -from nncf.common.quantization.structs import ( # type: ignore[import-untyped] - QuantizationScheme, - QuantizerConfig, -) from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] get_weight_compression_configuration, ) @@ -32,7 +27,6 @@ HistogramObserver, MappingType, PerChannelMinMaxObserver, - PerGroup, UniformQuantizationObserverBase, ) from torchao.quantization.pt2e.quantizer import ( @@ -45,7 +39,6 @@ ) QUANT_ANNOTATION_KEY = "quantization_annotation" -from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY class QuantizationMode(Enum): @@ -55,15 +48,19 @@ class QuantizationMode(Enum): - INT8_SYM: INT8 symmetric quantization for both activations and weights. - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights. - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models + - INT8WO_SYM: INT8 symmetric quantization for weights only. + - INT8WO_ASYM: INT8 asymmetric quantization for weights only. + - INT4WO_SYM: INT4 symmetric quantization for weights only. 
+ - INT4WO_ASYM: INT4 asymmetric quantization for weights only """ INT8_SYM = "int8_sym" INT8_MIXED = "int8_mixed" INT8_TRANSFORMER = "int8_transformer" - INT8_SYM_WC = "int8_sym_wc" - INT8_ASYM_WC = "int8_asym_wc" - INT4_SYM_WC = "int4_sym" - INT4_ASYM_WC = "int4_asym" + INT8WO_SYM = "int8wo_sym" + INT8WO_ASYM = "int8wo_asym" + INT4WO_SYM = "int4wo_sym" + INT4WO_ASYM = "int4wo_asym" class OpenVINOQuantizer(Quantizer): @@ -72,10 +69,17 @@ class OpenVINOQuantizer(Quantizer): optimally for the inference via OpenVINO. """ + WEIGHTS_ONLY_COMPRESSION_MODES = ( + QuantizationMode.INT4WO_SYM, + QuantizationMode.INT4WO_ASYM, + QuantizationMode.INT8WO_SYM, + QuantizationMode.INT8WO_ASYM, + ) + def __init__( self, *, - mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM, + mode: QuantizationMode = QuantizationMode.INT8_SYM, **kwargs, ): """ @@ -89,28 +93,21 @@ def __init__( :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm. """ self.mode = mode - self.wc_modes = [ - QuantizationMode.INT4_ASYM_WC, - QuantizationMode.INT4_SYM_WC, - QuantizationMode.INT8_ASYM_WC, - QuantizationMode.INT8_SYM_WC, - ] - if mode == QuantizationMode.INT8_SYM: - preset = quantization.structs.QuantizationPreset.PERFORMANCE - model_type = None - elif mode == QuantizationMode.INT8_MIXED: - preset = quantization.structs.QuantizationPreset.MIXED - model_type = None - else: - preset = None - model_type = nncf.parameters.ModelType.TRANSFORMER - if self.mode not in self.wc_modes: - self._min_max_algo = ( + if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES: + if mode == QuantizationMode.INT8_SYM: + preset = quantization.structs.QuantizationPreset.PERFORMANCE + model_type = None + elif mode == QuantizationMode.INT8_MIXED: + preset = quantization.structs.QuantizationPreset.MIXED + model_type = None + else: + preset = None + model_type = nncf.parameters.ModelType.TRANSFORMER + self._algo = ( nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization( 
preset=preset, model_type=model_type, **kwargs ) ) - self._algo = self._min_max_algo else: weight_compression_configuration = get_weight_compression_configuration( mode.value.replace( @@ -118,10 +115,9 @@ def __init__( ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) - self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( + self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=None, **weight_compression_configuration ) - self._algo = self._weight_compression_algo def set_ignored_scope( self, @@ -158,104 +154,131 @@ def get_nncf_quantization_setup( self._algo._set_backend_entity(model) return self._algo.find_quantization_setup(model, nncf_graph) - def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: - nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model) + def _annotate_weight_compression( + self, + model: torch.fx.GraphModule, + graph: torch.fx.Graph, + nncf_graph: NNCFGraph, + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation], + ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]: + """ + Annotates the model graph with weight-only quantization specs. 
- graph = model.graph - node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = ( - defaultdict(QuantizationAnnotation) - ) - # Serperate into annotation for quantize and compress - if self.mode in self.wc_modes: - self._algo.set_backend_entity(model) - nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) - for node in nodes_to_compress: - quantization_insertion_point = ( - quantization.quantizer_setup.WeightQuantizationInsertionPoint( - target_node_name=node.node_name - ) - ) - group_size = self._algo._group_size - num_bits = ( - 4 - if self.mode - in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC] - else 8 - ) - qmode = ( - QuantizationScheme.SYMMETRIC - if self.mode - in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC] - else QuantizationScheme.ASYMMETRIC - ) - nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode) - qp = quantization.quantizer_setup.SingleConfigQuantizationPoint( - qip=quantization_insertion_point, - qconfig=nncf_qconfig, - directly_quantized_operator_node_names=[node], - ) - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( - qp, group_size=group_size, weights_only=True + Identifies compressible nodes in the NNCF graph and attaches the corresponding + TorchAO quantization specifications to their weight edges for later transformation. + + :param model: The FX GraphModule to annotate. + :param graph: The underlying FX graph. + :param nncf_graph: The corresponding NNCF graph. + :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. + + :return: Updated mapping of FX nodes with weight compression annotations. 
+ """ + self._algo.set_backend_entity(model) + nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) + + for node in nodes_to_compress: + target_node = nncf_fx.node_utils.get_graph_node_by_name( + graph, node.node_name + ) + annotation = node_vs_torch_annotation[target_node] + edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) + group_size = getattr(self._algo, "_group_size", -1) + qspec = self._get_torch_ao_qspec_from_nncf_config( + qp=None, group_size=group_size, qmode=self.mode, weights_only=True + ) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + + return node_vs_torch_annotation + + def _annotate_post_training_quantization( + self, + model: torch.fx.GraphModule, + graph: torch.fx.Graph, + nncf_graph: NNCFGraph, + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation], + ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]: + """ + Annotates the model graph with post-training quantization configurations. + + Converts NNCF quantization points into TorchAO-compatible quantization specs, + assigning them to corresponding nodes or edges. Also handles unified scale groups, + ensuring shared quantization specs across grouped quantizers with consistent configs. + + :param model: The FX GraphModule to annotate. + :param graph: The underlying FX graph. + :param nncf_graph: The corresponding NNCF graph. + :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. + + :return: Updated mapping of FX nodes with post-training quantization annotations. 
+ """ + quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) + + for qp in quantization_setup.quantization_points.values(): + edge_or_node, annotation = self._get_edge_or_node_and_annotation( + graph, nncf_graph, qp, node_vs_torch_annotation + ) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp) + self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + + for quantizer_ids in quantization_setup.unified_scale_groups.values(): + root_quantizer_id = self._get_unified_scales_root_quantizer_id( + nncf_graph, quantizer_ids, quantization_setup + ) + root_qp = quantization_setup.quantization_points[root_quantizer_id] + + if any( + root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig + for q_id in quantizer_ids + ): + qps = [ + quantization_setup.quantization_points[qid] for qid in quantizer_ids + ] + raise nncf.InternalError( + "Different quantization configs are set to one unified scale group:" + f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" ) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) - else: - quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) - for qp in quantization_setup.quantization_points.values(): + root_target_node = nncf_fx.node_utils.get_graph_node_by_name( + graph, root_qp.insertion_point.target_node_name + ) + root_edge_or_node = self._get_edge_or_node( + root_target_node, root_qp, nncf_graph + ) + + for quantizer_id in quantizer_ids: + if quantizer_id == root_quantizer_id: + continue + + qspec = SharedQuantizationSpec(root_edge_or_node) # type: ignore[assignment] + qp = quantization_setup.quantization_points[quantizer_id] edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config( - qp - ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) - for quantizer_ids in 
quantization_setup.unified_scale_groups.values(): + return node_vs_torch_annotation - root_quantizer_id = self._get_unified_scales_root_quantizer_id( - nncf_graph, quantizer_ids, quantization_setup - ) - root_qp = quantization_setup.quantization_points[root_quantizer_id] - - if any( - root_qp.qconfig - != quantization_setup.quantization_points[q_id].qconfig - for q_id in quantizer_ids - ): - qps = [ - quantization_setup.quantization_points[q_id] - for q_id in quantizer_ids - ] - msg = ( - "Different quantization configs are set to one unified scale group:" - f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}" - ) - raise nncf.InternalError(msg) - - root_target_node = nncf_fx.node_utils.get_graph_node_by_name( - graph, root_qp.insertion_point.target_node_name - ) - root_edge_or_node = self._get_edge_or_node( - root_target_node, root_qp, nncf_graph - ) - - for quantizer_id in quantizer_ids: - if quantizer_id == root_quantizer_id: - continue + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model) + graph = model.graph + node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = ( + defaultdict(QuantizationAnnotation) + ) - qspec = SharedQuantizationSpec(root_edge_or_node) - qp = quantization_setup.quantization_points[quantizer_id] - edge_or_node, annotation = self._get_edge_or_node_and_annotation( - graph, nncf_graph, qp, node_vs_torch_annotation - ) - self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) + if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES: + node_vs_torch_annotation = self._annotate_weight_compression( + model, graph, nncf_graph, node_vs_torch_annotation + ) + else: + node_vs_torch_annotation = self._annotate_post_training_quantization( + model, graph, nncf_graph, node_vs_torch_annotation + ) for node, annotation in node_vs_torch_annotation.items(): - assert Q_ANNOTATION_KEY not in node.meta 
- node.meta[Q_ANNOTATION_KEY] = annotation + assert QUANT_ANNOTATION_KEY not in node.meta + node.meta[QUANT_ANNOTATION_KEY] = annotation + return model @staticmethod @@ -317,6 +340,36 @@ def _get_edge_or_node_and_annotation( edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph) return edge_or_node, annotation + @staticmethod + def _get_weight_edge( + target_node: torch.fx.Node, + nncf_graph: NNCFGraph, + ): + """ + Returns the FX node corresponding to the weight tensor input of a given operator node. + Uses the NNCF graph to identify which input port of the target node holds the weight. + If multiple weight ports are present, a warning is issued and only the first one is used. + + :param target_node: FX node representing a weighted operation (e.g., Linear, Conv). + :param nncf_graph: NNCFGraph used to determine weight port indices. + + :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight. + """ + nncf_node = nncf_graph.get_node_by_name(target_node.name) + weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids( + nncf_node, nncf_graph + ) + if len(weights_ports_ids) > 1: + # TODO(dlyakhov): support quantization for nodes with several weights + nncf.common.logging.nncf_logger.warning( + f"Quantization of the weighted node {target_node.name}" + " is not yet supported by the OpenVINOQuantizer." + f" Only the weight on port ID {weights_ports_ids[0]} will be quantized." + f" Quantizable weights are located on ports: {weights_ports_ids}." 
+ ) + weight_node = target_node.all_input_nodes[weights_ports_ids[0]] + return (weight_node, target_node) + @staticmethod def _get_edge_or_node( target_node: torch.fx.Node, @@ -333,22 +386,7 @@ def _get_edge_or_node( """ ip = qp.insertion_point if qp.is_weight_quantization_point(): - nncf_node = nncf_graph.get_node_by_name(target_node.name) - weights_ports_ids = ( - nncf.torch.model_graph_manager.get_weight_tensor_port_ids( - nncf_node, nncf_graph - ) - ) - if len(weights_ports_ids) > 1: - # TODO(dlyakhov): support quantization for nodes with several weights - nncf.common.logging.nncf_logger.warning( - f"Quantization of the weighted node {target_node.name}" - " is not yet supported by the OpenVINOQuantizer." - f" Only the weight on port ID {weights_ports_ids[0]} will be quantized." - f" Quantizable weights are located on ports: {weights_ports_ids}." - ) - weight_node = target_node.all_input_nodes[weights_ports_ids[0]] - return (weight_node, target_node) + return OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) if ip.input_port_id is None: return target_node @@ -377,22 +415,67 @@ def _fill_torch_ao_annotation( @staticmethod def _get_torch_ao_qspec_from_nncf_config( qp: quantization.quantizer_setup.QuantizationPointBase, - group_size=-1, - weights_only=False, + group_size: int = -1, + qmode: Optional[QuantizationMode] = None, + weights_only: bool = False, ) -> QuantizationSpec: """ - Retrieves the quantization configuration from the given quantization point and - converts it into a QuantizationSpec. - - :param qp: An instance of QuantizationPointBase. - :return: A QuantizationSpec retrieved and converted from the quantization point. + Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. + For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`, + and `weights_only`. For post-training quantization, only `qp` is required. + + :param qp: Quantization point from NNCF. 
+ :param group_size: Group size for INT4 group-wise quantization. + :param qmode: Quantization mode for weight compression. + :param weights_only: If True, applies weight-only quantization logic. + :return: A TorchAO QuantizationSpec. """ + observer: Type[UniformQuantizationObserverBase] + # Eps value is copied from nncf/torch/quantization/layers.py - extra_args = {"eps": 1e-16} + extra_args: Dict[str, Any] = {"eps": 1e-16} + + if weights_only: + mapping_type = ( + MappingType.SYMMETRIC + if qmode == QuantizationMode.INT4WO_SYM + else MappingType.ASYMMETRIC + ) + if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: + extra_args["mapping_type"] = mapping_type + extra_args["target_dtype"] = torch.int8 + extra_args["group_size"] = group_size + observer = INT4WeightObserver + quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 + quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = None + else: + observer = INT8WeightObserver + quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 + quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qmode == QuantizationMode.INT8WO_SYM + else torch.per_channel_affine + ) + + return QuantizationSpec( + dtype=dtype, + observer_or_fake_quant_ctr=observer.with_args(**extra_args), + quant_min=quant_min, + quant_max=quant_max, + qscheme=torch_qscheme, + ch_axis=channel_axis, + is_dynamic=False, + ) + is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig - observer: Type[UniformQuantizationObserverBase] if qconfig.per_channel: torch_qscheme = ( torch.per_channel_symmetric @@ -406,33 +489,16 @@ def _get_torch_ao_qspec_from_nncf_config( else torch.per_tensor_affine ) if is_weight: - mapping_type = ( - MappingType.SYMMETRIC - if qconfig.mode == QuantizationScheme.SYMMETRIC - else MappingType.ASYMMETRIC + observer = 
PerChannelMinMaxObserver + quant_min = -128 + quant_max = 127 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC + else torch.per_channel_affine ) - if qconfig.num_bits == 4: - extra_args["mapping_type"] = mapping_type - extra_args["target_dtype"] = torch.int8 - extra_args["granularity"] = PerGroup(group_size=group_size) - observer = PTPerBlockParamObserver - quant_min = -8 - quant_max = 7 - dtype = torch.int8 - channel_axis = 0 - elif qconfig.num_bits == 8: - observer = ( - NNCFInt8observer if weights_only else PerChannelMinMaxObserver - ) - quant_min = -128 - quant_max = 127 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = ( - torch.per_channel_symmetric - if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC - else torch.per_channel_affine - ) else: observer = ( HistogramObserver @@ -514,4 +580,4 @@ def quantize_model( smooth_quant=smooth_quant, **kwargs, ) - return quantized_model + return quantized_model \ No newline at end of file diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 47527a326f9..54acf67a21d 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -50,6 +50,7 @@ get_pt2e_quantization_params, get_pt2e_quantizers, get_qnn_quantizer, + get_ov_quantizer, get_vulkan_quantizer, ) from executorch.util.activation_memory_profiler import generate_memory_trace @@ -205,6 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser: choices=[ "xnnpack_dynamic", "xnnpack_dynamic_qc4", + "openvino_8da4w", + "openvino_8da8w", "qnn_8a8w", "qnn_16a16w", "qnn_16a4w", @@ -786,6 +789,12 @@ def get_quantizer_and_quant_params(llm_config): llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode ) quantizers.append(qnn_quantizer) + if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: + 
assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + ov_quantizer = get_ov_quantizer( + llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size + ) + quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml" coreml_quantizer = get_coreml_quantizer( diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index d87c722363f..4669d09e0e7 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -207,7 +207,7 @@ def get_qnn_quantizer( f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w." ) - assert ( + assert (get_qnn_quantizer quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) @@ -215,6 +215,42 @@ def get_qnn_quantizer( return qnn_quantizer, quant_dtype +def get_ov_quantizer( + pt2e_quantize: str, + group_size: int = 32, +): + try: + from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode + + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) + + backend, quant_config = pt2e_quantize.split("_") + assert ( + backend == "openvino" + ), f"The quantization config is for backend {backend} instead of openvino." + ov_quantizer = OpenVINOQuantizer() + # Manually ignore MP layers. + # ov_quantizer.set_ignored_scope() + + extra_quantizer_options = {"group_size": group_size} + if quant_config == "8da4w": + mode = QuantizationMode.INT4WO_SYM + + elif quant_config == "8da8w": + mode = QuantizationMode.INT8WO_SYM + else: + raise AssertionError( + f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
+ ) + + ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options) + + return ov_quantizer + + def get_coreml_quantizer(pt2e_quantize: str): try: from coremltools.optimize.torch.quantization.quantization_config import ( From 4cc7694433b12f7c8afe4c61b785e5158e0798e0 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 26 Aug 2025 18:32:27 +0400 Subject: [PATCH 02/27] fixes --- backends/openvino/quantizer/quantizer.py | 10 ++++-- examples/models/llama/export_llama_lib.py | 9 +++-- extension/llm/export/config/llm_config.py | 2 ++ extension/llm/export/quantizer_lib.py | 42 +++++++++++++++++++---- 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 31d41bff7be..f594c6fffa8 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -12,6 +12,7 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] +from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx @@ -176,8 +177,12 @@ def _annotate_weight_compression( """ self._algo.set_backend_entity(model) nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) + ignored_names = self._algo.get_ignored_node_names(nncf_graph) for node in nodes_to_compress: + is_target_node = should_consider_scope(node.node_name, ignored_names) + if not is_target_node: + continue target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node.node_name ) @@ -442,9 +447,9 @@ def _get_torch_ao_qspec_from_nncf_config( else MappingType.ASYMMETRIC ) if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: + extra_args["group_size"] = group_size extra_args["mapping_type"] = mapping_type extra_args["target_dtype"] = torch.int8 - extra_args["group_size"] = group_size observer = 
INT4WeightObserver quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 @@ -454,7 +459,7 @@ def _get_torch_ao_qspec_from_nncf_config( else: observer = INT8WeightObserver quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255 + quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( @@ -462,7 +467,6 @@ def _get_torch_ao_qspec_from_nncf_config( if qmode == QuantizationMode.INT8WO_SYM else torch.per_channel_affine ) - return QuantizationSpec( dtype=dtype, observer_or_fake_quant_ctr=observer.with_args(**extra_args), diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 54acf67a21d..269f927e9f6 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -791,8 +791,10 @@ def get_quantizer_and_quant_params(llm_config): quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + group_size = llm_config.quantization.group_size + group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( - llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size + llm_config.quantization.pt2e_quantize.value, ) quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: @@ -904,6 +906,7 @@ def _to_edge_and_lower_llama_xnnpack( def _to_edge_and_lower_llama_openvino( builder_exported, modelname, + quantizers, additional_passes, openvino_device: str = "CPU", nncf_compression: bool = False, @@ -935,7 +938,6 @@ def _to_edge_and_lower_llama_openvino( def transform_fn(prompts: str, tokenizer): tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - 
logging.error(tokenized_text) inputs = () inputs = ( @@ -971,7 +973,7 @@ def transform_fn(prompts: str, tokenizer): sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) - builder = builder_exported.to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1214,6 +1216,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 builder = _to_edge_and_lower_llama_openvino( builder_exported, modelname, + quantizers, additional_passes, openvino_device=llm_config.backend.openvino.device, nncf_compression=llm_config.backend.openvino.nncf_compression, diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index ab18c19159b..b4175d54cd7 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -275,6 +275,8 @@ class Pt2eQuantize(str, Enum): xnnpack_dynamic = "xnnpack_dynamic" xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" + openvino_8da4w = "openvino_8da4w" + openvino_8da8w = "openvino_8da8w" qnn_8a8w = "qnn_8a8w" qnn_16a16w = "qnn_16a16w" qnn_16a4w = "qnn_16a4w" diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 4669d09e0e7..2a20a90d55a 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -207,7 +207,7 @@ def get_qnn_quantizer( f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w." ) - assert (get_qnn_quantizer + assert ( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) @@ -231,22 +231,52 @@ def get_ov_quantizer( assert ( backend == "openvino" ), f"The quantization config is for backend {backend} instead of openvino." 
- ov_quantizer = OpenVINOQuantizer() + assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." + # Manually ignore MP layers. - # ov_quantizer.set_ignored_scope() + fp_node_names = linear_list = [ + "embedding", # First embedding is kept in Full precision + "linear_14", + "linear_15", + "linear_35", + "linear_56", + "linear_57", + "linear_63", + "linear_70", + "linear_71", + "linear_77", + "linear_78", + "linear_81", + "linear_84", + "linear_85", + "linear_88", + "linear_89", + "linear_91", + "linear_92", + "linear_95", + "linear_96", + "linear_98", + "linear_99", + "linear_102", + "linear_103", + "linear_105", + "linear_106", + "linear_109", + "linear_110", + "linear_112",] - extra_quantizer_options = {"group_size": group_size} if quant_config == "8da4w": mode = QuantizationMode.INT4WO_SYM elif quant_config == "8da8w": + group_size = -1 mode = QuantizationMode.INT8WO_SYM else: raise AssertionError( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) - - ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options) + ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) + ov_quantizer.set_ignored_scope(names=fp_node_names) return ov_quantizer From 5da40a57d7d42363b795d483630b00d9ce4b5f31 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 27 Aug 2025 13:48:41 +0400 Subject: [PATCH 03/27] support all_layers, backup mode in OVQuantizer --- backends/openvino/quantizer/quantizer.py | 25 ++++--- examples/models/llama/export_llama_lib.py | 82 ++++++++++------------- extension/llm/export/quantizer_lib.py | 8 +-- 3 files changed, 55 insertions(+), 60 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index f594c6fffa8..2ede04e53db 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -116,8 +116,14 @@ def __init__( ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) + subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + dataset = None # Only Data Free Quantization is Supported in OVQuantizer + compression_format = nncf.CompressionFormat.DQ + nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration( + subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration + ) self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( - subset_size=None, **weight_compression_configuration + subset_size=subset_size, **weight_compression_configuration ) def set_ignored_scope( @@ -176,21 +182,20 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. 
""" self._algo.set_backend_entity(model) - nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph) - ignored_names = self._algo.get_ignored_node_names(nncf_graph) + all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) - for node in nodes_to_compress: - is_target_node = should_consider_scope(node.node_name, ignored_names) - if not is_target_node: - continue + for wc_param in all_wc_params: + wc_config = wc_param.compression_config + node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( - graph, node.node_name + graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) - group_size = getattr(self._algo, "_group_size", -1) + group_size = wc_config.group_size + qmode = wc_config.mode qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, group_size=group_size, qmode=self.mode, weights_only=True + qp=None, group_size=group_size, qmode=qmode, weights_only=True ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 269f927e9f6..00785491100 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -792,9 +792,9 @@ def get_quantizer_and_quant_params(llm_config): if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size - group_size = group_size if group_size else 32 + group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( - llm_config.quantization.pt2e_quantize.value, + llm_config.quantization.pt2e_quantize.value, group_size ) quantizers.append(ov_quantizer) if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize: @@ 
-921,59 +921,51 @@ def _to_edge_and_lower_llama_openvino( logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - + try: + import nncf + from functools import partial + from pytorch_tokenizers import get_tokenizer + except ImportError: + raise ImportError( + "Please install nncf via backends/openvino/requirements.txt" + ) + + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + from datasets import load_dataset # Use NNCF compression if enabled # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize if nncf_compression: - try: - from functools import partial - - import nncf - from pytorch_tokenizers import get_tokenizer - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" - ) - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - - def transform_fn(prompts: str, tokenizer): - tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + dataset = dataset.filter(lambda example: example['text'].strip() != "") + dataset = dataset.filter(lambda example: example['text'].strip() != "\n") + def transform_fn( + prompts: str, tokenizer + ): + tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False) + device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda") inputs = () inputs = ( - torch.tensor(tokenized_text).unsqueeze(0), - {"input_pos": torch.tensor([0])}, + torch.tensor(tokenized_text[:128], device=device).unsqueeze(0), + {"input_pos": torch.tensor([0], device=device)}, ) return inputs - - builder_exported.calibration_data = ( - [builder_exported.calibration_data] - if isinstance(builder_exported.calibration_data, str) - else builder_exported.calibration_data - ) - builder_exported.calibration_data = ( - [ - word - for prompt in builder_exported.calibration_data - for word 
in prompt.split() - ] - if not builder_exported.dynamic_shapes - else builder_exported.calibration_data - ) - + builder_exported.pre_autograd_graph_module = nncf.compress_weights( - builder_exported.pre_autograd_graph_module, - dataset=nncf.Dataset( - builder_exported.calibration_data, - transform_func=partial(transform_fn, tokenizer=tokenizer), - ), - mode=nncf.CompressWeightsMode.INT4_SYM, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) + builder_exported.pre_autograd_graph_module, + dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + group_size=32, + backup_mode=nncf.BackupMode.NONE, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + + builder = builder_exported.to_edge_transform_and_lower(partitioners) + + else: + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 2a20a90d55a..9220c1efbdc 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -221,7 +221,7 @@ def get_ov_quantizer( ): try: from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode - + import nncf except ImportError: raise ImportError( "Please install nncf via backends/openvino/requirements.txt" @@ -234,8 +234,7 @@ def get_ov_quantizer( assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." # Manually ignore MP layers. 
- fp_node_names = linear_list = [ - "embedding", # First embedding is kept in Full precision + fp_node_names = [ "linear_14", "linear_15", "linear_35", @@ -262,8 +261,7 @@ def get_ov_quantizer( "linear_105", "linear_106", "linear_109", - "linear_110", - "linear_112",] + "linear_110",] if quant_config == "8da4w": mode = QuantizationMode.INT4WO_SYM From 9e65a7ef860e5725522859bbf8d863c76e26503d Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 27 Aug 2025 17:29:05 +0400 Subject: [PATCH 04/27] clean up and use new nncf method for obtaining compression parameters --- backends/openvino/quantizer/observers.py | 127 ++++++----------------- backends/openvino/quantizer/quantizer.py | 52 ++++------ 2 files changed, 48 insertions(+), 131 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 2ea66f11a55..845a091d24b 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -25,10 +25,7 @@ ) from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] - WeightCompressionConfig, -) -from nncf.quantization.algorithms.weight_compression.torch_fx_backend import ( # type: ignore[import-untyped] - FXWeightCompressionAlgoBackend, + WeightCompressionParameters, ) from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] do_integer_quantization, @@ -45,19 +42,31 @@ INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor, ) -from torchao.quantization.pt2e import MappingType, ObserverBase -from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes +from torchao.quantization.pt2e import ObserverBase + class WeightObserverBase(ObserverBase, ABC): """ Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation. 
""" + def __init__( + self, + wc_param: WeightCompressionParameters, + dtype: torch.dtype, + **kwargs, + ) -> None: + """ + :param wc_param: Weight compression parameter which contains information such as group_size + reduction_axes, quantization mode etc. + :param dtype: target dtype for quantization such as int8, uint8, etc. + """ + super().__init__(dtype=dtype, is_dynamic=False) + self.wc_param = wc_param + def calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, - observer_node: torch.fx.Node, - model: torch.fx.GraphModule, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ Calculate quantization parameters such as scale, quantized weight and zero point. @@ -65,26 +74,11 @@ def calculate_qparams( # type: ignore[override] :param weight: FP weight to be used for calculating qparams. :return: quantization params quantized weight, scale and zero point """ - ndims = len(weight.size()) - node_with_weight, weight_port_id = ( - WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model) - ) - _, node_metatype = GraphConverter.get_node_type_and_metatype( - node_with_weight, model - ) - # Special case where embedding metatype has to be mapped to AtenEmbedding metatype - node_metatype = ( - om.PTAtenEmbeddingMetatype - if node_metatype == om.PTEmbeddingMetatype - else node_metatype - ) - reduction_dims = get_weight_compression_reduction_axes( - node_metatype, weight_port_id, ndims - ) - reduction_dims = tuple(reduction_dims) - + wc_param = self.get_wc_param() + wc_config = wc_param.compression_config + reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( - Tensor(weight), self.wc_config, reduction_axes=reduction_dims + Tensor(weight), wc_config, reduction_axes=reduction_axes ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp @@ -92,23 +86,6 @@ def calculate_qparams( # type: ignore[override] def forward(self, x: torch.Tensor) -> torch.Tensor: return x - 
@staticmethod - def get_node_with_weight_and_port_ids( - observer_node: torch.fx.Node, model: torch.fx.GraphModule - ) -> Tuple[torch.fx.Node, int]: - """ - Returns the node which contains the weight and the weight port id. - - :param observer_node: Observer node for the weight. - :param graph: The model. - :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight. - """ - for node in model.graph.nodes: - if observer_node in node.all_input_nodes: - return node, node.all_input_nodes.index(observer_node) - msg = f"Observer node {observer_node.name} has no consumer node" - raise RuntimeError(msg) - def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: @@ -126,7 +103,7 @@ def convert( weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) q_weight, scale, zero_point = self.calculate_qparams( - original_weight, observer_node, model + original_weight ) decompressor = self._create_decompressor( @@ -134,6 +111,7 @@ def convert( ) packed_q_weight = decompressor.pack_weight(q_weight) + # Weight port id is 0 since observer is inserted for a single weight only. constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name @@ -177,7 +155,7 @@ def _create_decompressor( pass @abstractmethod - def get_wc_config(self) -> WeightCompressionConfig: + def get_wc_param(self) -> WeightCompressionParameters: """ Used to return the respective NNCF Weight Compression Config. @@ -191,30 +169,6 @@ class INT4WeightObserver(WeightObserverBase): This class defines the behavior for INT4 Weight Compression which has per-group granularity. """ - def __init__( - self, - group_size: int, - mapping_type: MappingType, - target_dtype: torch.dtype, - *args, - **kwargs, - ) -> None: - """ - :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization. 
- :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization. - :param target_dtype: target dtype for quantization such as int8, uint8, etc. - """ - super().__init__(dtype=target_dtype, is_dynamic=False) - self.wc_config = None - self.mapping_type = mapping_type - - qmode = ( - CompressWeightsMode.INT4_ASYM - if self.mapping_type == MappingType.ASYMMETRIC - else CompressWeightsMode.INT4_SYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size) - def _create_decompressor( self, scale: torch.Tensor, @@ -235,8 +189,8 @@ def _create_decompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) - def get_wc_config(self): - return self.wc_config + def get_wc_param(self) -> WeightCompressionParameters: + return self.wc_param class INT8WeightObserver(WeightObserverBase): @@ -244,30 +198,6 @@ class INT8WeightObserver(WeightObserverBase): This class defines the behavior for Int8 WC which has per channel granularity. """ - def __init__( - self, - qscheme: torch.qscheme, - dtype: torch.dtype, - ch_axis: int = 0, - *args, - **kwargs, - ) -> None: - """ - :param qscheme: Quantization scheme which is per-channel for Int8 WC. - :param dtype: dtype for quantization such as int8, uint8, etc.. - :param ch_axis: Channel axis. 
- """ - super().__init__(dtype=dtype, is_dynamic=False) - self.wc_config = None - self.qscheme = qscheme - - qmode = ( - CompressWeightsMode.INT8_SYM - if self.qscheme == torch.per_channel_symmetric - else CompressWeightsMode.INT8_ASYM - ) - self.wc_config = WeightCompressionConfig(mode=qmode) - def _create_decompressor( self, scale: torch.Tensor, @@ -282,5 +212,6 @@ def _create_decompressor( else: return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) - def get_wc_config(self): - return self.wc_config \ No newline at end of file + def get_wc_param(self) -> WeightCompressionParameters: + return self.wc_param + \ No newline at end of file diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 2ede04e53db..ef9a83ca77c 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -24,9 +24,11 @@ from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] get_weight_compression_configuration, ) +from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] + WeightCompressionParameters, +) from torchao.quantization.pt2e import ( HistogramObserver, - MappingType, PerChannelMinMaxObserver, UniformQuantizationObserverBase, ) @@ -112,16 +114,11 @@ def __init__( else: weight_compression_configuration = get_weight_compression_configuration( mode.value.replace( - "_wc", "" + "wo", "" ), # Mode value has to match NNCF CompressWeightsMode **kwargs, ) subset_size = 1 # Doesn't really matter in this case since it is data-free. 
Should just be +ve - dataset = None # Only Data Free Quantization is Supported in OVQuantizer - compression_format = nncf.CompressionFormat.DQ - nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration( - subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration - ) self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=subset_size, **weight_compression_configuration ) @@ -185,17 +182,14 @@ def _annotate_weight_compression( all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) for wc_param in all_wc_params: - wc_config = wc_param.compression_config node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) - group_size = wc_config.group_size - qmode = wc_config.mode qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, group_size=group_size, qmode=qmode, weights_only=True + qp=None, wc_param=wc_param ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) @@ -425,19 +419,16 @@ def _fill_torch_ao_annotation( @staticmethod def _get_torch_ao_qspec_from_nncf_config( qp: quantization.quantizer_setup.QuantizationPointBase, - group_size: int = -1, - qmode: Optional[QuantizationMode] = None, - weights_only: bool = False, + wc_param: WeightCompressionParameters = None, ) -> QuantizationSpec: """ Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. - For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`, - and `weights_only`. For post-training quantization, only `qp` is required. 
+ For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries + weight only quantization info such as group_size, reduction_axes etc. For post-training + quantization, only `qp` is required. :param qp: Quantization point from NNCF. - :param group_size: Group size for INT4 group-wise quantization. - :param qmode: Quantization mode for weight compression. - :param weights_only: If True, applies weight-only quantization logic. + :param wc_param: NNCF Weight compression parameters for the node. :return: A TorchAO QuantizationSpec. """ observer: Type[UniformQuantizationObserverBase] @@ -445,26 +436,21 @@ def _get_torch_ao_qspec_from_nncf_config( # Eps value is copied from nncf/torch/quantization/layers.py extra_args: Dict[str, Any] = {"eps": 1e-16} - if weights_only: - mapping_type = ( - MappingType.SYMMETRIC - if qmode == QuantizationMode.INT4WO_SYM - else MappingType.ASYMMETRIC - ) - if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]: - extra_args["group_size"] = group_size - extra_args["mapping_type"] = mapping_type - extra_args["target_dtype"] = torch.int8 + if wc_param: + qmode = wc_param.compression_config.mode + if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: + extra_args["wc_param"] = wc_param observer = INT4WeightObserver - quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15 + quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 dtype = torch.int8 channel_axis = 0 torch_qscheme = None else: + extra_args["wc_param"] = wc_param observer = INT8WeightObserver - quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0 - quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255 + quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 127 if not 
wc_param.compression_config.is_asym_mode else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( From 53e0f4cd0e01ed5a8adb85a7c08a2722d4a5a622 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 10:39:20 +0400 Subject: [PATCH 05/27] review changes & update method names according to wc algo --- backends/openvino/quantizer/observers.py | 4 ++-- backends/openvino/quantizer/quantizer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 845a091d24b..50fcc673ed6 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -30,7 +30,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped] do_integer_quantization, ) -from nncf.tensor.tensor import Tensor # type: ignore[import-untyped] +from nncf.tensor.tensor import Tensor as NNCFTensor # type: ignore[import-untyped] from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped] PTTargetPoint, TargetType, @@ -78,7 +78,7 @@ def calculate_qparams( # type: ignore[override] wc_config = wc_param.compression_config reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( - Tensor(weight), wc_config, reduction_axes=reduction_axes + NNCFTensor(weight), wc_config, reduction_axes=reduction_axes ) zp = zp.data if zp is not None else None return q_weight.data, scale.data, zp diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index ef9a83ca77c..2e364424b16 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -179,7 +179,7 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. 
""" self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph) + all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph) for wc_param in all_wc_params: node_with_weight = wc_param.node_with_weight From bf959305dc210416f20c327509291db3655028e9 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 11:14:13 +0400 Subject: [PATCH 06/27] review changes --- backends/openvino/quantizer/observers.py | 2 +- backends/openvino/quantizer/quantizer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 50fcc673ed6..b1054460a16 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -166,7 +166,7 @@ def get_wc_param(self) -> WeightCompressionParameters: class INT4WeightObserver(WeightObserverBase): """ - This class defines the behavior for INT4 Weight Compression which has per-group granularity. + OpenVINO INT4 Weight Compression observer. 
""" def _create_decompressor( diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 2e364424b16..485d67e3bb9 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -187,7 +187,7 @@ def _annotate_weight_compression( graph, node_with_weight.node_name ) annotation = node_vs_torch_annotation[target_node] - edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph) + edge_or_node = self._get_weight_edge(target_node, nncf_graph) qspec = self._get_torch_ao_qspec_from_nncf_config( qp=None, wc_param=wc_param ) From 2d4bec7a4b0041ead027a6c651e00eee32343dc4 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 1 Sep 2025 11:31:40 +0400 Subject: [PATCH 07/27] review changes --- backends/openvino/quantizer/observers.py | 38 ++++++----------------- backends/openvino/quantizer/quantizer.py | 7 +---- examples/models/llama/export_llama_lib.py | 2 +- 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index b1054460a16..d44a22556dd 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -9,12 +9,7 @@ from abc import ABC, abstractmethod from typing import Optional, Tuple -import nncf.torch.graph.operator_metatypes as om # type: ignore[import-untyped] - import torch -from nncf.experimental.torch.fx.nncf_graph_builder import ( # type: ignore[import-untyped] - GraphConverter, -) from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped] get_tensor_constant_from_node, @@ -23,7 +18,6 @@ constant_update_fn, module_insertion_transformation_builder, ) -from nncf.parameters import CompressWeightsMode # type: ignore[import-untyped] from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, ) @@ -57,9 +51,8 @@ def __init__( **kwargs, ) -> None: """ - 
:param wc_param: Weight compression parameter which contains information such as group_size - reduction_axes, quantization mode etc. - :param dtype: target dtype for quantization such as int8, uint8, etc. + :param wc_param: Weight compression parameters container. + :param dtype: target dtype for the quantization. """ super().__init__(dtype=dtype, is_dynamic=False) self.wc_param = wc_param @@ -69,10 +62,10 @@ def calculate_qparams( # type: ignore[override] weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ - Calculate quantization parameters such as scale, quantized weight and zero point. + Calculates quantization parameters: quantized weight, quantization scale and quantization zero point. :param weight: FP weight to be used for calculating qparams. - :return: quantization params quantized weight, scale and zero point + :return: A tuple containing the quantized weight, quantization scale and quantization zero point. """ wc_param = self.get_wc_param() wc_config = wc_param.compression_config @@ -90,10 +83,8 @@ def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: """ - Converts the weight observer node into a decompression subgraph after calibration. - This method is responsible for transforming the model after the quantization preparation - and calibration phases. It replaces the observer node with the quantized weight and a decompression - module. + Replaces the given observer node from the given model with a quantized + weight and a OpenVINO specific decompression module. :param model: A `torch.fx.GraphModule` representing the statically traced model with observer nodes attached and calibrated. @@ -144,7 +135,7 @@ def _create_decompressor( original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: """ - Used to return the respective NNCF decompressor for different types of quantization. + Returns a respective NNCF decompressor for different types of quantization. 
:param scale: Calculated scale quantization parameter. :param zero_point: Calculated zero_point quantization parameter. @@ -152,17 +143,14 @@ def _create_decompressor( :param original_weight: FP weight. :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO. """ - pass - @abstractmethod def get_wc_param(self) -> WeightCompressionParameters: """ - Used to return the respective NNCF Weight Compression Config. + Returns a respective NNCF Weight Compression Config. :return: Weight compression config with the compression information such as qmode, group_size etc. """ - pass - + return self.wc_param class INT4WeightObserver(WeightObserverBase): """ @@ -189,13 +177,10 @@ def _create_decompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) - def get_wc_param(self) -> WeightCompressionParameters: - return self.wc_param - class INT8WeightObserver(WeightObserverBase): """ - This class defines the behavior for Int8 WC which has per channel granularity. + OpenVINO INT8 Weight Compression per channel observer. """ def _create_decompressor( @@ -212,6 +197,3 @@ def _create_decompressor( else: return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) - def get_wc_param(self) -> WeightCompressionParameters: - return self.wc_param - \ No newline at end of file diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 485d67e3bb9..7f86686d03c 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -205,15 +205,10 @@ def _annotate_post_training_quantization( """ Annotates the model graph with post-training quantization configurations. - Converts NNCF quantization points into TorchAO-compatible quantization specs, - assigning them to corresponding nodes or edges. Also handles unified scale groups, - ensuring shared quantization specs across grouped quantizers with consistent configs. 
- :param model: The FX GraphModule to annotate. :param graph: The underlying FX graph. :param nncf_graph: The corresponding NNCF graph. :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. - :return: Updated mapping of FX nodes with post-training quantization annotations. """ quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph) @@ -575,4 +570,4 @@ def quantize_model( smooth_quant=smooth_quant, **kwargs, ) - return quantized_model \ No newline at end of file + return quantized_model diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 00785491100..269022f2cf7 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert len(quantizers) == 0, "Should not enable both xnnpack and openvino" + assert quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( From 0a2e361f04aa724c8af7d88c1dbd286b4c7556d6 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 3 Sep 2025 20:48:10 +0400 Subject: [PATCH 08/27] Update export_llama_lib.py --- examples/models/llama/export_llama_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 269022f2cf7..8eab3eefbc0 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert quantizers, "Should not enable both xnnpack and openvino" + assert not 
quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 32 ov_quantizer = get_ov_quantizer( From c8ea777098b8a812e6162b767dbfeabdd7c193c4 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:39:52 +0400 Subject: [PATCH 09/27] use new transformations --- backends/openvino/quantizer/observers.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index d44a22556dd..76ab33eb5c5 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -15,8 +15,9 @@ get_tensor_constant_from_node, ) from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped] - constant_update_fn, - module_insertion_transformation_builder, + constant_update, + module_insertion, + node_removal, ) from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, @@ -103,7 +104,7 @@ def convert( packed_q_weight = decompressor.pack_weight(q_weight) # Weight port id is 0 since observer is inserted for a single weight only. 
- constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0) + constant_update(model, observer_node, packed_q_weight, input_port_id=0) compressed_weight_name = observer_node.all_input_nodes[0].name decompressor_suffix = "_".join( @@ -111,7 +112,8 @@ def convert( ) decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}" - module_insertion_transformation_builder( + module_insertion( + model, decompressor, [ PTTargetPoint( @@ -120,11 +122,8 @@ def convert( ) ], decompressor_name, - )(model) - - decomp_node = observer_node.args[0] - observer_node.replace_all_uses_with(decomp_node) # type: ignore[arg-type] - model.graph.erase_node(observer_node) + ) + node_removal(model, observer_node, 0) @abstractmethod def _create_decompressor( From a6b605f41b5390ff9de70b2397a2d00003f34ff2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:46:24 +0400 Subject: [PATCH 10/27] add comment for manual MP allocation --- extension/llm/export/quantizer_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 9220c1efbdc..e839827208c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -233,7 +233,7 @@ def get_ov_quantizer( ), f"The quantization config is for backend {backend} instead of openvino." assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." - # Manually ignore MP layers. + # (TODO) Manually ignore MP layers. 
This is done manually for now till we use the dynamic allocation MP fp_node_names = [ "linear_14", "linear_15", From 9614fc4da170d76a39e047d0c364177bf96d0209 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:48:58 +0400 Subject: [PATCH 11/27] remove nncf_compression from export llama lib --- examples/models/llama/export_llama_lib.py | 54 +---------------------- 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 8eab3eefbc0..ac52893b99c 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -567,13 +567,6 @@ def build_args_parser() -> argparse.ArgumentParser: help="path to the input pruning token mapping file (token_map.json)", ) - parser.add_argument( - "--nncf_compression", - default=False, - action="store_true", - help="Enables nncf compression for openvino backend", - ) - parser.add_argument( "--export_only", default=False, @@ -909,7 +902,6 @@ def _to_edge_and_lower_llama_openvino( quantizers, additional_passes, openvino_device: str = "CPU", - nncf_compression: bool = False, verbose: bool = False, ) -> LLMEdgeManager: # noqa: C901 partitioners = [] @@ -921,51 +913,8 @@ def _to_edge_and_lower_llama_openvino( logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - try: - import nncf - from functools import partial - from pytorch_tokenizers import get_tokenizer - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" - ) - - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - from datasets import load_dataset - # Use NNCF compression if enabled - # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize - if nncf_compression: - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = dataset.filter(lambda 
example: example['text'].strip() != "") - dataset = dataset.filter(lambda example: example['text'].strip() != "\n") - def transform_fn( - prompts: str, tokenizer - ): - tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False) - device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda") - inputs = () - inputs = ( - torch.tensor(tokenized_text[:128], device=device).unsqueeze(0), - {"input_pos": torch.tensor([0], device=device)}, - ) - - return inputs - - builder_exported.pre_autograd_graph_module = nncf.compress_weights( - builder_exported.pre_autograd_graph_module, - dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)), - mode=nncf.CompressWeightsMode.INT4_SYM, - group_size=32, - backup_mode=nncf.BackupMode.NONE, - ratio=0.8, - sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, - ) - - builder = builder_exported.to_edge_transform_and_lower(partitioners) - - else: - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1211,7 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 quantizers, additional_passes, openvino_device=llm_config.backend.openvino.device, - nncf_compression=llm_config.backend.openvino.nncf_compression, verbose=llm_config.debug.verbose, ) else: From 45007cf90c054ccfd527874ae35d383fc34a4ee8 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:52:58 +0400 Subject: [PATCH 12/27] change pt2e quantize flag to use openvino_4wo instead of openvino_8da4w and so on --- extension/llm/export/config/llm_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index b4175d54cd7..49855d61e6e 100644 --- 
a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -275,8 +275,8 @@ class Pt2eQuantize(str, Enum): xnnpack_dynamic = "xnnpack_dynamic" xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4" - openvino_8da4w = "openvino_8da4w" - openvino_8da8w = "openvino_8da8w" + openvino_4wo = "openvino_4wo" + openvino_8wo = "openvino_8wo" qnn_8a8w = "qnn_8a8w" qnn_16a16w = "qnn_16a16w" qnn_16a4w = "qnn_16a4w" From 9d494147457e6696f7149e4b7cb69f95811cbd47 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:53:14 +0400 Subject: [PATCH 13/27] follow up to last commit --- examples/models/llama/export_llama_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index ac52893b99c..ec03f4b26c9 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -206,8 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser: choices=[ "xnnpack_dynamic", "xnnpack_dynamic_qc4", - "openvino_8da4w", - "openvino_8da8w", + "openvino_4wo", + "openvino_8wo", "qnn_8a8w", "qnn_16a16w", "qnn_16a4w", From d6727cfed609d07281fdea42358d2e234ac82f19 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 13:56:47 +0400 Subject: [PATCH 14/27] update quantizer lib with openvino_4wo --- extension/llm/export/quantizer_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index e839827208c..8a097f9b8f1 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -263,10 +263,10 @@ def get_ov_quantizer( "linear_109", "linear_110",] - if quant_config == "8da4w": + if quant_config == "4wo": mode = QuantizationMode.INT4WO_SYM - elif quant_config == "8da8w": + elif quant_config == "8wo": group_size = -1 mode = QuantizationMode.INT8WO_SYM else: From 
4a0a7819ab69aa0d8fdfce70f3be219c14abc409 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 6 Sep 2025 14:06:48 +0400 Subject: [PATCH 15/27] split qspec function into 2 parts; 1 for WC and other for PTQ qspecs --- backends/openvino/quantizer/quantizer.py | 92 +++++++++++++----------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 7f86686d03c..ef04ed0de46 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -188,8 +188,8 @@ def _annotate_weight_compression( ) annotation = node_vs_torch_annotation[target_node] edge_or_node = self._get_weight_edge(target_node, nncf_graph) - qspec = self._get_torch_ao_qspec_from_nncf_config( - qp=None, wc_param=wc_param + qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc( + wc_param=wc_param ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) @@ -217,7 +217,7 @@ def _annotate_post_training_quantization( edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp) + qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for quantizer_ids in quantization_setup.unified_scale_groups.values(): @@ -412,18 +412,58 @@ def _fill_torch_ao_annotation( annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec @staticmethod - def _get_torch_ao_qspec_from_nncf_config( + def _get_torch_ao_qspec_from_nncf_config_for_wc( + wc_param: WeightCompressionParameters, + ) -> QuantizationSpec: + """ + Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter. + + :param wc_param: NNCF Weight compression parameters for the node. + :return: A TorchAO QuantizationSpec. 
+ """ + observer: Type[UniformQuantizationObserverBase] + + extra_args: Dict[str, Any] = {} + + qmode = wc_param.compression_config.mode + if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: + extra_args["wc_param"] = wc_param + observer = INT4WeightObserver + quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = None + else: + extra_args["wc_param"] = wc_param + observer = INT8WeightObserver + quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 + quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 + dtype = torch.int8 + channel_axis = 0 + torch_qscheme = ( + torch.per_channel_symmetric + if qmode == QuantizationMode.INT8WO_SYM + else torch.per_channel_affine + ) + return QuantizationSpec( + dtype=dtype, + observer_or_fake_quant_ctr=observer.with_args(**extra_args), + quant_min=quant_min, + quant_max=quant_max, + qscheme=torch_qscheme, + ch_axis=channel_axis, + is_dynamic=False, + ) + + @staticmethod + def _get_torch_ao_qspec_from_nncf_config_for_ptq( qp: quantization.quantizer_setup.QuantizationPointBase, - wc_param: WeightCompressionParameters = None, ) -> QuantizationSpec: """ - Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments. - For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries - weight only quantization info such as group_size, reduction_axes etc. For post-training - quantization, only `qp` is required. + Returns a TorchAO QuantizationSpec based on NNCF quantization point. :param qp: Quantization point from NNCF. - :param wc_param: NNCF Weight compression parameters for the node. :return: A TorchAO QuantizationSpec. 
""" observer: Type[UniformQuantizationObserverBase] @@ -431,38 +471,6 @@ def _get_torch_ao_qspec_from_nncf_config( # Eps value is copied from nncf/torch/quantization/layers.py extra_args: Dict[str, Any] = {"eps": 1e-16} - if wc_param: - qmode = wc_param.compression_config.mode - if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: - extra_args["wc_param"] = wc_param - observer = INT4WeightObserver - quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = None - else: - extra_args["wc_param"] = wc_param - observer = INT8WeightObserver - quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 - dtype = torch.int8 - channel_axis = 0 - torch_qscheme = ( - torch.per_channel_symmetric - if qmode == QuantizationMode.INT8WO_SYM - else torch.per_channel_affine - ) - return QuantizationSpec( - dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(**extra_args), - quant_min=quant_min, - quant_max=quant_max, - qscheme=torch_qscheme, - ch_axis=channel_axis, - is_dynamic=False, - ) - is_weight = qp.is_weight_quantization_point() qconfig = qp.qconfig From f6a1ee3d708ca46fe495f081bc45872042b1bed6 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 12:14:34 +0400 Subject: [PATCH 16/27] micro fix --- backends/openvino/quantizer/quantizer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index ef04ed0de46..762ed2a9171 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -426,24 +426,29 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( extra_args: Dict[str, Any] = {} qmode = wc_param.compression_config.mode + is_asym_mode = 
wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: extra_args["wc_param"] = wc_param observer = INT4WeightObserver - quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15 + quant_min = -8 if not is_asym_mode else 0 + quant_max = 7 if not is_asym_mode else 15 dtype = torch.int8 channel_axis = 0 - torch_qscheme = None + torch_qscheme = torch_qscheme = ( + torch.per_channel_symmetric + if not is_asym_mode + else torch.per_channel_affine + ) else: extra_args["wc_param"] = wc_param observer = INT8WeightObserver - quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0 - quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255 + quant_min = -128 if not is_asym_mode else 0 + quant_max = 127 if not is_asym_mode else 255 dtype = torch.int8 channel_axis = 0 torch_qscheme = ( torch.per_channel_symmetric - if qmode == QuantizationMode.INT8WO_SYM + if not is_asym_mode else torch.per_channel_affine ) return QuantizationSpec( From d285fcce354f8bde55e968892932cbe4a34421cd Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 15:35:49 +0400 Subject: [PATCH 17/27] update mixed precision layers for higher accuracy. Change INT4 mode to Asymmetric --- extension/llm/export/quantizer_lib.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 8a097f9b8f1..46b10dcb960 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,21 +235,17 @@ def get_ov_quantizer( # (TODO) Manually ignore MP layers.
This is done manually for now till we use the dynamic allocation MP fp_node_names = [ + "linear_13", "linear_14", - "linear_15", "linear_35", "linear_56", - "linear_57", - "linear_63", "linear_70", "linear_71", "linear_77", "linear_78", - "linear_81", "linear_84", "linear_85", "linear_88", - "linear_89", "linear_91", "linear_92", "linear_95", @@ -261,10 +257,11 @@ def get_ov_quantizer( "linear_105", "linear_106", "linear_109", - "linear_110",] + "linear_110", + "linear_111",] if quant_config == "4wo": - mode = QuantizationMode.INT4WO_SYM + mode = QuantizationMode.INT4WO_ASYM elif quant_config == "8wo": group_size = -1 From 4e66df1a52e40e90178f4c9fce815d364c5282f9 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Mon, 8 Sep 2025 18:12:37 +0400 Subject: [PATCH 18/27] Apply suggestions from code review Co-authored-by: Daniil Lyakhov --- backends/openvino/quantizer/observers.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 76ab33eb5c5..59a40f2be2d 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -56,9 +56,9 @@ def __init__( :param dtype: target dtype for the quantization. """ super().__init__(dtype=dtype, is_dynamic=False) - self.wc_param = wc_param + self._wc_param = wc_param - def calculate_qparams( # type: ignore[override] + def _calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: @@ -68,7 +68,7 @@ def calculate_qparams( # type: ignore[override] :param weight: FP weight to be used for calculating qparams. :return: A tuple containing the quantized weight, quantization scale and quantization zero point. 
""" - wc_param = self.get_wc_param() + wc_param = self._wc_param wc_config = wc_param.compression_config reduction_axes = wc_param.reduction_axes q_weight, scale, zp = do_integer_quantization( @@ -143,13 +143,6 @@ def _create_decompressor( :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO. """ - def get_wc_param(self) -> WeightCompressionParameters: - """ - Returns a respective NNCF Weight Compression Config. - - :return: Weight compression config with the compression information such as qmode, group_size etc. - """ - return self.wc_param class INT4WeightObserver(WeightObserverBase): """ From e850e419cb313e86fd0f5669e7eaa1d115fcc10c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:13:28 +0400 Subject: [PATCH 19/27] Review changes --- backends/openvino/quantizer/observers.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 59a40f2be2d..457399117e0 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -94,7 +94,7 @@ def convert( """ weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self.calculate_qparams( + q_weight, scale, zero_point = self._calculate_qparams( original_weight ) @@ -156,18 +156,17 @@ def _create_decompressor( q_weight: torch.Tensor, original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: - if zero_point is not None: - return INT4AsymmetricWeightsDecompressor( - scale, - zero_point, - q_weight.shape, - original_weight.shape, - original_weight.dtype, - ) - else: + if zero_point is None: return INT4SymmetricWeightsDecompressor( scale, q_weight.shape, original_weight.shape, original_weight.dtype ) + return INT4AsymmetricWeightsDecompressor( + scale, + zero_point, + q_weight.shape, + original_weight.shape, + 
original_weight.dtype, + ) class INT8WeightObserver(WeightObserverBase): @@ -182,10 +181,11 @@ def _create_decompressor( q_weight: torch.Tensor, original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: - if zero_point is not None: - return INT8AsymmetricWeightsDecompressor( - scale, zero_point, original_weight.dtype + if zero_point is None: + return INT8SymmetricWeightsDecompressor( + scale, original_weight.dtype ) - else: - return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) + return INT8AsymmetricWeightsDecompressor( + scale, zero_point, original_weight.dtype + ) From 204043f973ba928c3f2b73dc11e1db6572b7c4a7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:33:16 +0400 Subject: [PATCH 20/27] review changes in quantizer --- backends/openvino/quantizer/quantizer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 762ed2a9171..7e0e3c92af0 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -175,7 +175,6 @@ def _annotate_weight_compression( :param graph: The underlying FX graph. :param nncf_graph: The corresponding NNCF graph. :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. - :return: Updated mapping of FX nodes with weight compression annotations. """ self._algo.set_backend_entity(model) @@ -343,7 +342,7 @@ def _get_edge_or_node_and_annotation( def _get_weight_edge( target_node: torch.fx.Node, nncf_graph: NNCFGraph, - ): + ) -> tuple[torch.fx.Node, torch.fx.Node]: """ Returns the FX node corresponding to the weight tensor input of a given operator node. Uses the NNCF graph to identify which input port of the target node holds the weight. @@ -351,7 +350,6 @@ def _get_weight_edge( :param target_node: FX node representing a weighted operation (e.g., Linear, Conv). :param nncf_graph: NNCFGraph used to determine weight port indices. 
- :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight. """ nncf_node = nncf_graph.get_node_by_name(target_node.name) @@ -428,7 +426,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( qmode = wc_param.compression_config.mode is_asym_mode = wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: - extra_args["wc_param"] = wc_param observer = INT4WeightObserver quant_min = -8 if not is_asym_mode else 0 quant_max = 7 if not is_asym_mode else 15 @@ -440,7 +437,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( else torch.per_channel_affine ) else: - extra_args["wc_param"] = wc_param observer = INT8WeightObserver quant_min = -128 if not is_asym_mode else 0 quant_max = 127 if not is_asym_mode else 255 @@ -453,7 +449,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( ) return QuantizationSpec( dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(**extra_args), + observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param), quant_min=quant_min, quant_max=quant_max, qscheme=torch_qscheme, From ae6b089f293d20248df4c3d8a0d0c5ddfed62c4c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 8 Sep 2025 18:45:54 +0400 Subject: [PATCH 21/27] revert extra args changes --- backends/openvino/quantizer/quantizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 7e0e3c92af0..89d528f8d16 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -424,6 +424,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( extra_args: Dict[str, Any] = {} qmode = wc_param.compression_config.mode + extra_args["wc_param"] = wc_param is_asym_mode = wc_param.compression_config.is_asym_mode if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]: observer = 
INT4WeightObserver @@ -449,7 +450,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( ) return QuantizationSpec( dtype=dtype, - observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param), + observer_or_fake_quant_ctr=observer.with_args(**extra_args), quant_min=quant_min, quant_max=quant_max, qscheme=torch_qscheme, From 2de569398917362b9ffc02849037528c2a15efa7 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 9 Sep 2025 11:43:00 +0400 Subject: [PATCH 22/27] precommit fixes --- backends/openvino/quantizer/observers.py | 11 +++------ backends/openvino/quantizer/quantizer.py | 30 +++++++++++++---------- examples/models/llama/export_llama_lib.py | 6 +++-- extension/llm/export/quantizer_lib.py | 21 +++++++++------- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index 457399117e0..faeb4fa7a60 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -84,7 +84,7 @@ def convert( self, model: torch.fx.GraphModule, observer_node: torch.fx.Node ) -> None: """ - Replaces the given observer node from the given model with a quantized + Replaces the given observer node from the given model with a quantized weight and a OpenVINO specific decompression module. 
:param model: A `torch.fx.GraphModule` representing the statically traced model @@ -94,9 +94,7 @@ def convert( """ weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self._calculate_qparams( - original_weight - ) + q_weight, scale, zero_point = self._calculate_qparams(original_weight) decompressor = self._create_decompressor( scale, zero_point, q_weight, original_weight @@ -182,10 +180,7 @@ def _create_decompressor( original_weight: torch.Tensor, ) -> BaseWeightsDecompressor: if zero_point is None: - return INT8SymmetricWeightsDecompressor( - scale, original_weight.dtype - ) + return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype) return INT8AsymmetricWeightsDecompressor( scale, zero_point, original_weight.dtype ) - diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 9db79fce9f9..bef1ef3274f 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -12,7 +12,6 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] -from nncf.common.scopes import should_consider_scope # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] import torch.fx @@ -21,12 +20,12 @@ INT8WeightObserver, ) from nncf.common.graph.graph import NNCFGraph # type: ignore[import-untyped] -from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] - get_weight_compression_configuration, -) from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped] WeightCompressionParameters, ) +from nncf.quantization.quantize_model import ( # type: ignore[import-untyped] + get_weight_compression_configuration, +) from torchao.quantization.pt2e import ( HistogramObserver, PerChannelMinMaxObserver, @@ -118,7 +117,7 @@ def __init__( ), # Mode 
value has to match NNCF CompressWeightsMode **kwargs, ) - subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( subset_size=subset_size, **weight_compression_configuration ) @@ -178,7 +177,9 @@ def _annotate_weight_compression( :return: Updated mapping of FX nodes with weight compression annotations. """ self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph) + all_wc_params, _ = self._algo.get_weight_compression_parameters( + model, nncf_graph + ) for wc_param in all_wc_params: node_with_weight = wc_param.node_with_weight @@ -187,9 +188,7 @@ def _annotate_weight_compression( ) annotation = node_vs_torch_annotation[target_node] edge_or_node = self._get_weight_edge(target_node, nncf_graph) - qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc( - wc_param=wc_param - ) + qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) return node_vs_torch_annotation @@ -216,7 +215,9 @@ def _annotate_post_training_quantization( edge_or_node, annotation = self._get_edge_or_node_and_annotation( graph, nncf_graph, qp, node_vs_torch_annotation ) - qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) + qspec: QuantizationSpecBase = ( + self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp) + ) self._fill_torch_ao_annotation(edge_or_node, qspec, annotation) for quantizer_ids in quantization_setup.unified_scale_groups.values(): @@ -426,8 +427,11 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( qmode = wc_param.compression_config.mode extra_args["wc_param"] = wc_param is_asym_mode = wc_param.compression_config.is_asym_mode - if qmode in [nncf.CompressWeightsMode.INT4_ASYM, 
nncf.CompressWeightsMode.INT4_SYM]: - observer = INT4WeightObserver + if qmode in [ + nncf.CompressWeightsMode.INT4_ASYM, + nncf.CompressWeightsMode.INT4_SYM, + ]: + observer = INT4WeightObserver # type: ignore[type-abstract] quant_min = -8 if not is_asym_mode else 0 quant_max = 7 if not is_asym_mode else 15 dtype = torch.int8 @@ -438,7 +442,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc( else torch.per_channel_affine ) else: - observer = INT8WeightObserver + observer = INT8WeightObserver # type: ignore[type-abstract] quant_min = -128 if not is_asym_mode else 0 quant_max = 127 if not is_asym_mode else 255 dtype = torch.int8 diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 578fd0fea7b..d9c282888cc 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -43,10 +43,10 @@ ) from executorch.extension.llm.export.quantizer_lib import ( get_coreml_quantizer, + get_ov_quantizer, get_pt2e_quantization_params, get_pt2e_quantizers, get_qnn_quantizer, - get_ov_quantizer, get_vulkan_quantizer, ) from executorch.util.activation_memory_profiler import generate_memory_trace @@ -897,7 +897,9 @@ def _to_edge_and_lower_llama_openvino( for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners) + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( + partitioners + ) if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 83d4a84420d..df8c2a5e36c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -220,20 +220,22 @@ def get_ov_quantizer( group_size: int = 32, ): try: - from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode 
- import nncf - except ImportError: - raise ImportError( - "Please install nncf via backends/openvino/requirements.txt" + from executorch.backends.openvino.quantizer import ( + OpenVINOQuantizer, + QuantizationMode, ) - + except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + backend, quant_config = pt2e_quantize.split("_") assert ( backend == "openvino" ), f"The quantization config is for backend {backend} instead of openvino." - assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel." + assert ( + group_size + ), "Group Size None is Not Supported. It should be set to -1 for per-channel." - # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP + # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP fp_node_names = [ "linear_13", "linear_14", @@ -258,7 +260,8 @@ def get_ov_quantizer( "linear_106", "linear_109", "linear_110", - "linear_111",] + "linear_111", + ] if quant_config == "4wo": mode = QuantizationMode.INT4WO_ASYM From 0e10f28242129a3c332ccdbd7a3b9a4340a8e1a1 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Tue, 9 Sep 2025 21:52:23 +0400 Subject: [PATCH 23/27] revert _calculate_qparams back to calculate_qparams --- backends/openvino/quantizer/observers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py index faeb4fa7a60..6cda4561604 100644 --- a/backends/openvino/quantizer/observers.py +++ b/backends/openvino/quantizer/observers.py @@ -58,7 +58,7 @@ def __init__( super().__init__(dtype=dtype, is_dynamic=False) self._wc_param = wc_param - def _calculate_qparams( # type: ignore[override] + def calculate_qparams( # type: ignore[override] self, weight: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: @@ -94,7 +94,7 @@ def convert( """ 
weight_node = observer_node.args[0] original_weight = get_tensor_constant_from_node(weight_node, model) - q_weight, scale, zero_point = self._calculate_qparams(original_weight) + q_weight, scale, zero_point = self.calculate_qparams(original_weight) decompressor = self._create_decompressor( scale, zero_point, q_weight, original_weight From 05f5a929c7c5b9a79859d9c9848ce37dd0c16b41 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 10 Sep 2025 18:49:08 +0400 Subject: [PATCH 24/27] remove manual ignored nodes --- extension/llm/export/quantizer_lib.py | 29 --------------------------- 1 file changed, 29 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index df8c2a5e36c..870080a7549 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,34 +235,6 @@ def get_ov_quantizer( group_size ), "Group Size None is Not Supported. It should be set to -1 for per-channel." - # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP - fp_node_names = [ - "linear_13", - "linear_14", - "linear_35", - "linear_56", - "linear_70", - "linear_71", - "linear_77", - "linear_78", - "linear_84", - "linear_85", - "linear_88", - "linear_91", - "linear_92", - "linear_95", - "linear_96", - "linear_98", - "linear_99", - "linear_102", - "linear_103", - "linear_105", - "linear_106", - "linear_109", - "linear_110", - "linear_111", - ] - if quant_config == "4wo": mode = QuantizationMode.INT4WO_ASYM @@ -274,7 +246,6 @@ def get_ov_quantizer( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) - ov_quantizer.set_ignored_scope(names=fp_node_names) return ov_quantizer From fbe0e21137ee9ebc8ea246e61fd9cfa252f57b15 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Wed, 10 Sep 2025 18:52:42 +0400 Subject: [PATCH 25/27] add ratio to quantizer initialization --- extension/llm/export/quantizer_lib.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 870080a7549..350e8b3ce7c 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -235,17 +235,23 @@ def get_ov_quantizer( group_size ), "Group Size None is Not Supported. It should be set to -1 for per-channel." + quantization_params = {} + if quant_config == "4wo": - mode = QuantizationMode.INT4WO_ASYM + quantization_params["mode"] = QuantizationMode.INT4WO_ASYM + quantization_params["group_size"] = group_size + quantization_params["ratio"] = 0.8 elif quant_config == "8wo": - group_size = -1 - mode = QuantizationMode.INT8WO_SYM + quantization_params["mode"] = QuantizationMode.INT8WO_ASYM + quantization_params["group_size"] = -1 + quantization_params["ratio"] = None + else: raise AssertionError( f"No support for quant type {quant_config}. Support 8a4w, 8a8w only." 
) - ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size) + ov_quantizer = OpenVINOQuantizer(**quantization_params) return ov_quantizer From 6bff1cdb00ebdae53b57ab706cab6e9e9ee7e335 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Thu, 11 Sep 2025 23:04:13 +0400 Subject: [PATCH 26/27] Update export_llama_lib.py --- examples/models/llama/export_llama_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index d9c282888cc..cbbf169a085 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -768,7 +768,7 @@ def get_quantizer_and_quant_params(llm_config): if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: assert not quantizers, "Should not enable both xnnpack and openvino" group_size = llm_config.quantization.group_size - group_size = group_size if group_size else 32 + group_size = group_size if group_size else 128 ov_quantizer = get_ov_quantizer( llm_config.quantization.pt2e_quantize.value, group_size ) From d744ae95f3cf806278b12db346105e233a2daec5 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Thu, 11 Sep 2025 23:04:50 +0400 Subject: [PATCH 27/27] Update quantizer_lib.py --- extension/llm/export/quantizer_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 350e8b3ce7c..f92c59cebd3 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -217,7 +217,7 @@ def get_qnn_quantizer( def get_ov_quantizer( pt2e_quantize: str, - group_size: int = 32, + group_size: int = 128, ): try: from executorch.backends.openvino.quantizer import (