From 30a1a258b22d1471c0aae328f30a5910af6af118 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Tue, 26 Aug 2025 12:31:49 +0400
Subject: [PATCH 01/27] openvino quantizer refactored

---
 backends/openvino/quantizer/__init__.py       |   4 +-
 backends/openvino/quantizer/observers.py      | 286 ++++++++++++
 .../quantizer/observers/nncf_observers.py     | 176 --------
 backends/openvino/quantizer/quantizer.py      | 412 ++++++++++--------
 examples/models/llama/export_llama_lib.py     |   9 +
 extension/llm/export/quantizer_lib.py         |  38 +-
 6 files changed, 573 insertions(+), 352 deletions(-)
 create mode 100644 backends/openvino/quantizer/observers.py
 delete mode 100644 backends/openvino/quantizer/observers/nncf_observers.py

diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index df038483f2f..0fd8c10b249 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model
+from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
 
-__all__ = ["OpenVINOQuantizer", "quantize_model"]
+__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
new file mode 100644
index 00000000000..2ea66f11a55
--- /dev/null
+++ b/backends/openvino/quantizer/observers.py
@@ -0,0 +1,286 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: disable-error-code=import-not-found
+
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import nncf.torch.graph.operator_metatypes as om  # type: ignore[import-untyped]
+
+import torch
+from nncf.experimental.torch.fx.nncf_graph_builder import (  # type: ignore[import-untyped]
+    GraphConverter,
+)
+
+from nncf.experimental.torch.fx.node_utils import (  # type: ignore[import-untyped]
+    get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import (  # type: ignore[import-untyped]
+    constant_update_fn,
+    module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode  # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
+    WeightCompressionConfig,
+)
+from nncf.quantization.algorithms.weight_compression.torch_fx_backend import (  # type: ignore[import-untyped]
+    FXWeightCompressionAlgoBackend,
+)
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (  # type: ignore[import-untyped]
+    do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor  # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import (  # type: ignore[import-untyped]
+    PTTargetPoint,
+    TargetType,
+)
+from nncf.torch.quantization.layers import (  # type: ignore[import-untyped]
+    BaseWeightsDecompressor,
+    INT4AsymmetricWeightsDecompressor,
+    INT4SymmetricWeightsDecompressor,
+    INT8AsymmetricWeightsDecompressor,
+    INT8SymmetricWeightsDecompressor,
+)
+from torchao.quantization.pt2e import MappingType, ObserverBase
+from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+
+class WeightObserverBase(ObserverBase, ABC):
+    """
+    Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
+    """
+
+    def calculate_qparams(  # type: ignore[override]
+        self,
+        weight: torch.Tensor,
+        observer_node: torch.fx.Node,
+        model: torch.fx.GraphModule,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Quantize the weight and return the quantized weight, scale and zero point.
+
+        :param weight: FP weight to be used for calculating qparams.
+        :param observer_node: Observer node attached to the weight.
+        :param model: Graph module that contains the observer node.
+        :return: Quantized weight, scale and zero point (None if no zero point is produced).
+        """
+        node_with_weight, weight_port_id = (
+            WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
+        )
+        _, node_metatype = GraphConverter.get_node_type_and_metatype(
+            node_with_weight, model
+        )
+        # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
+        if node_metatype == om.PTEmbeddingMetatype:
+            node_metatype = om.PTAtenEmbeddingMetatype
+        reduction_dims = tuple(
+            get_weight_compression_reduction_axes(
+                node_metatype, weight_port_id, weight.dim()
+            )
+        )
+
+        # Go through the abstract accessor: this base class never defines `wc_config` itself.
+        q_weight, scale, zp = do_integer_quantization(
+            Tensor(weight), self.get_wc_config(), reduction_axes=reduction_dims
+        )
+        zp = zp.data if zp is not None else None
+        return q_weight.data, scale.data, zp
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x  # Pass-through: the weight is read from the graph in convert(), not observed here.
+
+    @staticmethod
+    def get_node_with_weight_and_port_ids(
+        observer_node: torch.fx.Node, model: torch.fx.GraphModule
+    ) -> Tuple[torch.fx.Node, int]:
+        """
+        Returns the node which contains the weight and the weight port id; raises RuntimeError if no node consumes the observer.
+
+        :param observer_node: Observer node for the weight.
+        :param model: The model whose graph is searched for the consumer of the observer node.
+        :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
+        """
+        for node in model.graph.nodes:
+            if observer_node in node.all_input_nodes:
+                return node, node.all_input_nodes.index(observer_node)
+        msg = f"Observer node {observer_node.name} has no consumer node"
+        raise RuntimeError(msg)
+
+    def convert(
+        self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
+    ) -> None:
+        """
+        Converts the weight observer node into a decompression subgraph after calibration.
+        This method is responsible for transforming the model after the quantization preparation
+        and calibration phases. It replaces the observer node with the quantized weight and a decompression
+        module.
+
+        :param model: A `torch.fx.GraphModule` representing the statically traced model
+                    with observer nodes attached and calibrated.
+        :param observer_node: The `torch.fx.Node` corresponding to the observer module for
+                            the weight that is being transformed into a compressed representation.
+        """
+        weight_node = observer_node.args[0]
+        original_weight = get_tensor_constant_from_node(weight_node, model)
+        q_weight, scale, zero_point = self.calculate_qparams(
+            original_weight, observer_node, model
+        )
+        # The subclass picks the decompressor type; the same object also packs the quantized weight.
+        decompressor = self._create_decompressor(
+            scale, zero_point, q_weight, original_weight
+        )
+        packed_q_weight = decompressor.pack_weight(q_weight)
+        # Presumably swaps the FP constant on the observer's input port 0 for the packed weight.
+        constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+        # Module name is built from the weight node name with its last two "_" tokens dropped.
+        compressed_weight_name = observer_node.all_input_nodes[0].name
+        decompressor_suffix = "_".join(
+            compressed_weight_name.replace(".", "_").split("_")[:-2]
+        )
+        decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
+        # Register the decompressor as a post-hook on the compressed weight node.
+        module_insertion_transformation_builder(
+            decompressor,
+            [
+                PTTargetPoint(
+                    TargetType.OPERATOR_POST_HOOK,
+                    target_node_name=compressed_weight_name,
+                )
+            ],
+            decompressor_name,
+        )(model)
+        # NOTE(review): args[0] is re-read after the insertion; presumably it is now the decompressor output.
+        decomp_node = observer_node.args[0]
+        observer_node.replace_all_uses_with(decomp_node)  # type: ignore[arg-type]
+        model.graph.erase_node(observer_node)
+
+    @abstractmethod
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        """
+        Used to return the respective NNCF decompressor for different types of quantization.
+
+        :param scale: Calculated scale quantization parameter.
+        :param zero_point: Calculated zero_point quantization parameter.
+        :param q_weight: Calculated quantized weight.
+        :param original_weight: FP weight.
+        :return: NNCF decompressor for the quantization type, which creates the decompression subgraph supported by OpenVINO.
+        """
+        pass
+
+    @abstractmethod
+    def get_wc_config(self) -> WeightCompressionConfig:
+        """
+        Returns the NNCF weight compression config of this observer.
+
+        :return: Weight compression config with the compression information such as qmode, group_size etc.
+        """
+        pass
+
+
+class INT4WeightObserver(WeightObserverBase):
+    """
+    This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+    """
+
+    def __init__(
+        self,
+        group_size: int,
+        mapping_type: MappingType,
+        target_dtype: torch.dtype,
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
+        :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
+        :param target_dtype: target dtype for quantization such as int8, uint8, etc.
+        :param args, kwargs: Accepted but not used by this observer.
+        """
+        super().__init__(dtype=target_dtype, is_dynamic=False)
+        self.mapping_type = mapping_type
+
+        qmode = (
+            CompressWeightsMode.INT4_ASYM
+            if self.mapping_type == MappingType.ASYMMETRIC
+            else CompressWeightsMode.INT4_SYM
+        )
+        self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
+
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        if zero_point is not None:
+            return INT4AsymmetricWeightsDecompressor(
+                scale,
+                zero_point,
+                q_weight.shape,
+                original_weight.shape,
+                original_weight.dtype,
+            )
+        else:
+            return INT4SymmetricWeightsDecompressor(
+                scale, q_weight.shape, original_weight.shape, original_weight.dtype
+            )
+
+    def get_wc_config(self) -> WeightCompressionConfig:
+        return self.wc_config
+
+
+class INT8WeightObserver(WeightObserverBase):
+    """
+    This class defines the behavior for Int8 WC which has per channel granularity.
+    """
+
+    def __init__(
+        self,
+        qscheme: torch.qscheme,
+        dtype: torch.dtype,
+        ch_axis: int = 0,
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        :param qscheme: Quantization scheme which is per-channel for Int8 WC.
+        :param dtype: dtype for quantization such as int8, uint8, etc..
+        :param ch_axis: Channel axis. Accepted but not used by this observer.
+        :param args, kwargs: Accepted but not used by this observer.
+        """
+        super().__init__(dtype=dtype, is_dynamic=False)
+        self.qscheme = qscheme
+
+        qmode = (
+            CompressWeightsMode.INT8_SYM
+            if self.qscheme == torch.per_channel_symmetric
+            else CompressWeightsMode.INT8_ASYM
+        )
+        self.wc_config = WeightCompressionConfig(mode=qmode)
+
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        if zero_point is not None:
+            return INT8AsymmetricWeightsDecompressor(
+                scale, zero_point, original_weight.dtype
+            )
+        else:
+            return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+
+    def get_wc_config(self) -> WeightCompressionConfig:
+        return self.wc_config
diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
deleted file mode 100644
index f6ac2a3cb91..00000000000
--- a/backends/openvino/quantizer/observers/nncf_observers.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Qualcomm Innovation Center, Inc.
-# All rights reserved
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import torch
-from nncf.experimental.torch.fx.node_utils import (  # type: ignore[import-untyped]
-    get_tensor_constant_from_node,
-)
-from nncf.experimental.torch.fx.transformations import (  # type: ignore[import-untyped]
-    constant_update_fn,
-    module_insertion_transformation_builder,
-)
-from nncf.parameters import CompressWeightsMode  # type: ignore[import-untyped]
-from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
-    WeightCompressionConfig,
-)
-
-from nncf.quantization.algorithms.weight_compression.weight_lowering import (  # type: ignore[import-untyped]
-    do_integer_quantization,
-)
-from nncf.tensor.tensor import Tensor  # type: ignore[import-untyped]
-from nncf.torch.graph.transformations.commands import (  # type: ignore[import-untyped]
-    PTTargetPoint,
-    TargetType,
-)
-from nncf.torch.quantization.layers import (  # type: ignore[import-untyped]
-    INT4AsymmetricWeightsDecompressor,
-    INT4SymmetricWeightsDecompressor,
-    INT8AsymmetricWeightsDecompressor,
-    INT8SymmetricWeightsDecompressor,
-)
-from torchao.quantization.observer import AffineQuantizedMinMaxObserver
-from torchao.quantization.pt2e import (
-    get_block_size,
-    MappingType,
-    PerAxis,
-    PerChannelMinMaxObserver,
-    PerGroup,
-)
-from torchao.quantization.quant_primitives import _get_reduction_params
-
-
-class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        qmode = (
-            CompressWeightsMode.INT4_ASYM
-            if self.mapping_type == MappingType.ASYMMETRIC
-            else CompressWeightsMode.INT4_SYM
-        )
-        assert isinstance(
-            self.granularity, PerGroup
-        ), "Only PerGroup granularity is supported"
-        self.wc_config = WeightCompressionConfig(
-            mode=qmode, group_size=self.granularity.group_size
-        )
-
-    def calculate_qparams(self, weight):
-        assert hasattr(self, "min_val") and hasattr(
-            self, "max_val"
-        ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
-        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
-        assert len(reduction_dims) == 1, "Only 1-D group size is supported"
-        reduction_dims = reduction_dims[0] - 1
-        q_weight, scale, zp = do_integer_quantization(
-            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
-        )
-        zp = zp.data if zp is not None else None
-        return q_weight.data, scale.data, zp
-
-    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
-        print("calling convert")
-        assert (
-            self.original_dtype is not None
-        ), "Expecting original_dtype to be populated"
-        weight_node = observer_node.args[0]
-        original_weight = get_tensor_constant_from_node(weight_node, model)
-        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
-        with model.graph.inserting_before(observer_node):
-            if zero_point is not None:
-                decompressor = INT4AsymmetricWeightsDecompressor(
-                    scale,
-                    zero_point,
-                    q_weight.shape,
-                    original_weight.shape,
-                    original_weight.dtype,
-                )
-            else:
-                decompressor = INT4SymmetricWeightsDecompressor(
-                    scale, q_weight.shape, original_weight.shape, original_weight.dtype
-                )
-            packed_q_weight = decompressor.pack_weight(q_weight)
-            constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
-            compressed_weight_name = observer_node.all_input_nodes[0].name
-            decompressor_suffix = "_".join(
-                compressed_weight_name.replace(".", "_").split("_")[:-2]
-            )
-            decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
-            module_insertion_transformation_builder(
-                decompressor,
-                [
-                    PTTargetPoint(
-                        TargetType.OPERATOR_POST_HOOK,
-                        target_node_name=compressed_weight_name,
-                    )
-                ],
-                decompressor_name,
-            )(model)
-        decomp_node = observer_node.args[0]
-        observer_node.replace_all_uses_with(decomp_node)
-        model.graph.erase_node(observer_node)
-
-
-class NNCFInt8observer(PerChannelMinMaxObserver):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        qmode = (
-            CompressWeightsMode.INT8_SYM
-            if self.qscheme == torch.per_channel_symmetric
-            else CompressWeightsMode.INT8_ASYM
-        )
-        self.wc_config = WeightCompressionConfig(mode=qmode)
-
-    def calculate_qparams(self, weight):
-        assert hasattr(self, "min_val") and hasattr(
-            self, "max_val"
-        ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
-        self.granularity = PerAxis(axis=self.ch_axis)
-        self.block_size = get_block_size(weight.shape, self.granularity)
-        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
-        q_weight, scale, zp = do_integer_quantization(
-            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
-        )
-        zp = zp.data if zp is not None else None
-        return q_weight.data, scale.data, zp
-
-    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
-        print("calling convert")
-        weight_node = observer_node.args[0]
-        original_weight = get_tensor_constant_from_node(weight_node, model)
-        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
-
-        with model.graph.inserting_before(observer_node):
-            if zero_point is not None:
-                decompressor = INT8AsymmetricWeightsDecompressor(
-                    scale, zero_point, original_weight.dtype
-                )
-            else:
-                decompressor = INT8SymmetricWeightsDecompressor(
-                    scale, original_weight.dtype
-                )
-            packed_q_weight = decompressor.pack_weight(q_weight)
-            constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
-            compressed_weight_name = observer_node.all_input_nodes[0].name
-            decompressor_suffix = "_".join(
-                compressed_weight_name.replace(".", "_").split("_")[:-2]
-            )
-            decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
-
-            module_insertion_transformation_builder(
-                decompressor,
-                [
-                    PTTargetPoint(
-                        TargetType.OPERATOR_POST_HOOK,
-                        target_node_name=compressed_weight_name,
-                    )
-                ],
-                decompressor_name,
-            )(model)
-        decomp_node = observer_node.args[0]
-        observer_node.replace_all_uses_with(decomp_node)
-        model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index cd78f6907c7..31d41bff7be 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,16 +15,11 @@
 import nncf.experimental.torch.fx as nncf_fx  # type: ignore[import-untyped]
 
 import torch.fx
-from executorch.backends.openvino.quantizer.observers.nncf_observers import (
-    NNCFInt8observer,
-    PTPerBlockParamObserver,
+from executorch.backends.openvino.quantizer.observers import (
+    INT4WeightObserver,
+    INT8WeightObserver,
 )
-
 from nncf.common.graph.graph import NNCFGraph  # type: ignore[import-untyped]
-from nncf.common.quantization.structs import (  # type: ignore[import-untyped]
-    QuantizationScheme,
-    QuantizerConfig,
-)
 from nncf.quantization.quantize_model import (  # type: ignore[import-untyped]
     get_weight_compression_configuration,
 )
@@ -32,7 +27,6 @@
     HistogramObserver,
     MappingType,
     PerChannelMinMaxObserver,
-    PerGroup,
     UniformQuantizationObserverBase,
 )
 from torchao.quantization.pt2e.quantizer import (
@@ -45,7 +39,6 @@
 )
 
 QUANT_ANNOTATION_KEY = "quantization_annotation"
-from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
 
 
 class QuantizationMode(Enum):
@@ -55,15 +48,19 @@ class QuantizationMode(Enum):
     - INT8_SYM: INT8 symmetric quantization for both activations and weights.
     - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
     - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
+    - INT8WO_SYM: INT8 symmetric quantization for weights only.
+    - INT8WO_ASYM: INT8 asymmetric quantization for weights only.
+    - INT4WO_SYM: INT4 symmetric quantization for weights only.
+    - INT4WO_ASYM: INT4 asymmetric quantization for weights only.
     """
 
     INT8_SYM = "int8_sym"
     INT8_MIXED = "int8_mixed"
     INT8_TRANSFORMER = "int8_transformer"
-    INT8_SYM_WC = "int8_sym_wc"
-    INT8_ASYM_WC = "int8_asym_wc"
-    INT4_SYM_WC = "int4_sym"
-    INT4_ASYM_WC = "int4_asym"
+    INT8WO_SYM = "int8wo_sym"
+    INT8WO_ASYM = "int8wo_asym"
+    INT4WO_SYM = "int4wo_sym"
+    INT4WO_ASYM = "int4wo_asym"
 
 
 class OpenVINOQuantizer(Quantizer):
@@ -72,10 +69,17 @@ class OpenVINOQuantizer(Quantizer):
     optimally for the inference via OpenVINO.
     """
 
+    WEIGHTS_ONLY_COMPRESSION_MODES = (
+        QuantizationMode.INT4WO_SYM,
+        QuantizationMode.INT4WO_ASYM,
+        QuantizationMode.INT8WO_SYM,
+        QuantizationMode.INT8WO_ASYM,
+    )
+
     def __init__(
         self,
         *,
-        mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+        mode: QuantizationMode = QuantizationMode.INT8_SYM,
         **kwargs,
     ):
         """
@@ -89,28 +93,21 @@ def __init__(
         :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
         """
         self.mode = mode
-        self.wc_modes = [
-            QuantizationMode.INT4_ASYM_WC,
-            QuantizationMode.INT4_SYM_WC,
-            QuantizationMode.INT8_ASYM_WC,
-            QuantizationMode.INT8_SYM_WC,
-        ]
-        if mode == QuantizationMode.INT8_SYM:
-            preset = quantization.structs.QuantizationPreset.PERFORMANCE
-            model_type = None
-        elif mode == QuantizationMode.INT8_MIXED:
-            preset = quantization.structs.QuantizationPreset.MIXED
-            model_type = None
-        else:
-            preset = None
-            model_type = nncf.parameters.ModelType.TRANSFORMER
-        if self.mode not in self.wc_modes:
-            self._min_max_algo = (
+        if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+            if mode == QuantizationMode.INT8_SYM:
+                preset = quantization.structs.QuantizationPreset.PERFORMANCE
+                model_type = None
+            elif mode == QuantizationMode.INT8_MIXED:
+                preset = quantization.structs.QuantizationPreset.MIXED
+                model_type = None
+            else:
+                preset = None
+                model_type = nncf.parameters.ModelType.TRANSFORMER
+            self._algo = (
                 nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
                     preset=preset, model_type=model_type, **kwargs
                 )
             )
-            self._algo = self._min_max_algo
         else:
             weight_compression_configuration = get_weight_compression_configuration(
                 mode.value.replace(
@@ -118,10 +115,9 @@ def __init__(
                 ),  # Mode value has to match NNCF CompressWeightsMode
                 **kwargs,
             )
-            self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+            self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
                 subset_size=None, **weight_compression_configuration
             )
-            self._algo = self._weight_compression_algo
 
     def set_ignored_scope(
         self,
@@ -158,104 +154,131 @@ def get_nncf_quantization_setup(
         self._algo._set_backend_entity(model)
         return self._algo.find_quantization_setup(model, nncf_graph)
 
-    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
-        nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+    def _annotate_weight_compression(
+        self,
+        model: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
+        nncf_graph: NNCFGraph,
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+    ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+        """
+        Annotates the model graph with weight-only quantization specs.
 
-        graph = model.graph
-        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
-            defaultdict(QuantizationAnnotation)
-        )
-        # Serperate into annotation for quantize and compress
-        if self.mode in self.wc_modes:
-            self._algo.set_backend_entity(model)
-            nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
-            for node in nodes_to_compress:
-                quantization_insertion_point = (
-                    quantization.quantizer_setup.WeightQuantizationInsertionPoint(
-                        target_node_name=node.node_name
-                    )
-                )
-                group_size = self._algo._group_size
-                num_bits = (
-                    4
-                    if self.mode
-                    in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
-                    else 8
-                )
-                qmode = (
-                    QuantizationScheme.SYMMETRIC
-                    if self.mode
-                    in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
-                    else QuantizationScheme.ASYMMETRIC
-                )
-                nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
-                qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
-                    qip=quantization_insertion_point,
-                    qconfig=nncf_qconfig,
-                    directly_quantized_operator_node_names=[node],
-                )
-                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                    graph, nncf_graph, qp, node_vs_torch_annotation
-                )
-                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
-                    qp, group_size=group_size, weights_only=True
+        Identifies compressible nodes in the NNCF graph and attaches the corresponding
+        TorchAO quantization specifications to their weight edges for later transformation.
+
+        :param model: The FX GraphModule to annotate.
+        :param graph: The underlying FX graph.
+        :param nncf_graph: The corresponding NNCF graph.
+        :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+        :return: Updated mapping of FX nodes with weight compression annotations.
+        """
+        self._algo.set_backend_entity(model)
+        nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+
+        for node in nodes_to_compress:
+            target_node = nncf_fx.node_utils.get_graph_node_by_name(
+                graph, node.node_name
+            )
+            annotation = node_vs_torch_annotation[target_node]
+            edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+            group_size = getattr(self._algo, "_group_size", -1)
+            qspec = self._get_torch_ao_qspec_from_nncf_config(
+                qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+            )
+            self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+        return node_vs_torch_annotation
+
+    def _annotate_post_training_quantization(
+        self,
+        model: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
+        nncf_graph: NNCFGraph,
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+    ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+        """
+        Annotates the model graph with post-training quantization configurations.
+
+        Converts NNCF quantization points into TorchAO-compatible quantization specs,
+        assigning them to corresponding nodes or edges. Also handles unified scale groups,
+        ensuring shared quantization specs across grouped quantizers with consistent configs.
+
+        :param model: The FX GraphModule to annotate.
+        :param graph: The underlying FX graph.
+        :param nncf_graph: The corresponding NNCF graph.
+        :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+
+        :return: Updated mapping of FX nodes with post-training quantization annotations.
+        """
+        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
+
+        for qp in quantization_setup.quantization_points.values():
+            edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                graph, nncf_graph, qp, node_vs_torch_annotation
+            )
+            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+            self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+        for quantizer_ids in quantization_setup.unified_scale_groups.values():
+            root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+                nncf_graph, quantizer_ids, quantization_setup
+            )
+            root_qp = quantization_setup.quantization_points[root_quantizer_id]
+
+            if any(
+                root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+                for q_id in quantizer_ids
+            ):
+                qps = [
+                    quantization_setup.quantization_points[qid] for qid in quantizer_ids
+                ]
+                raise nncf.InternalError(
+                    "Different quantization configs are set to one unified scale group:"
+                    f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
                 )
-                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
-        else:
-            quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
 
-            for qp in quantization_setup.quantization_points.values():
+            root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+                graph, root_qp.insertion_point.target_node_name
+            )
+            root_edge_or_node = self._get_edge_or_node(
+                root_target_node, root_qp, nncf_graph
+            )
+
+            for quantizer_id in quantizer_ids:
+                if quantizer_id == root_quantizer_id:
+                    continue
+
+                qspec = SharedQuantizationSpec(root_edge_or_node)  # type: ignore[assignment]
+                qp = quantization_setup.quantization_points[quantizer_id]
                 edge_or_node, annotation = self._get_edge_or_node_and_annotation(
                     graph, nncf_graph, qp, node_vs_torch_annotation
                 )
-                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
-                    qp
-                )
                 self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
-            for quantizer_ids in quantization_setup.unified_scale_groups.values():
+        return node_vs_torch_annotation
 
-                root_quantizer_id = self._get_unified_scales_root_quantizer_id(
-                    nncf_graph, quantizer_ids, quantization_setup
-                )
-                root_qp = quantization_setup.quantization_points[root_quantizer_id]
-
-                if any(
-                    root_qp.qconfig
-                    != quantization_setup.quantization_points[q_id].qconfig
-                    for q_id in quantizer_ids
-                ):
-                    qps = [
-                        quantization_setup.quantization_points[q_id]
-                        for q_id in quantizer_ids
-                    ]
-                    msg = (
-                        "Different quantization configs are set to one unified scale group:"
-                        f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
-                    )
-                    raise nncf.InternalError(msg)
-
-                root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
-                    graph, root_qp.insertion_point.target_node_name
-                )
-                root_edge_or_node = self._get_edge_or_node(
-                    root_target_node, root_qp, nncf_graph
-                )
-
-                for quantizer_id in quantizer_ids:
-                    if quantizer_id == root_quantizer_id:
-                        continue
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+        graph = model.graph
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
+            defaultdict(QuantizationAnnotation)
+        )
 
-                    qspec = SharedQuantizationSpec(root_edge_or_node)
-                    qp = quantization_setup.quantization_points[quantizer_id]
-                    edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                        graph, nncf_graph, qp, node_vs_torch_annotation
-                    )
-                    self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+        if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+            node_vs_torch_annotation = self._annotate_weight_compression(
+                model, graph, nncf_graph, node_vs_torch_annotation
+            )
+        else:
+            node_vs_torch_annotation = self._annotate_post_training_quantization(
+                model, graph, nncf_graph, node_vs_torch_annotation
+            )
 
         for node, annotation in node_vs_torch_annotation.items():
-            assert Q_ANNOTATION_KEY not in node.meta
-            node.meta[Q_ANNOTATION_KEY] = annotation
+            assert QUANT_ANNOTATION_KEY not in node.meta
+            node.meta[QUANT_ANNOTATION_KEY] = annotation
+
         return model
 
     @staticmethod
@@ -317,6 +340,36 @@ def _get_edge_or_node_and_annotation(
         edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph)
         return edge_or_node, annotation
 
+    @staticmethod
+    def _get_weight_edge(
+        target_node: torch.fx.Node,
+        nncf_graph: NNCFGraph,
+    ):
+        """
+        Returns the FX node corresponding to the weight tensor input of a given operator node.
+        Uses the NNCF graph to identify which input port of the target node holds the weight.
+        If multiple weight ports are present, a warning is issued and only the first one is used.
+
+        :param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
+        :param nncf_graph: NNCFGraph used to determine weight port indices.
+
+        :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
+        """
+        nncf_node = nncf_graph.get_node_by_name(target_node.name)
+        weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
+            nncf_node, nncf_graph
+        )
+        if len(weights_ports_ids) > 1:
+            # TODO(dlyakhov): support quantization for nodes with several weights
+            nncf.common.logging.nncf_logger.warning(
+                f"Quantization of the weighted node {target_node.name}"
+                " is not yet supported by the OpenVINOQuantizer."
+                f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
+                f" Quantizable weights are located on ports: {weights_ports_ids}."
+            )
+        weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
+        return (weight_node, target_node)
+
     @staticmethod
     def _get_edge_or_node(
         target_node: torch.fx.Node,
@@ -333,22 +386,7 @@ def _get_edge_or_node(
         """
         ip = qp.insertion_point
         if qp.is_weight_quantization_point():
-            nncf_node = nncf_graph.get_node_by_name(target_node.name)
-            weights_ports_ids = (
-                nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
-                    nncf_node, nncf_graph
-                )
-            )
-            if len(weights_ports_ids) > 1:
-                # TODO(dlyakhov): support quantization for nodes with several weights
-                nncf.common.logging.nncf_logger.warning(
-                    f"Quantization of the weighted node {target_node.name}"
-                    " is not yet supported by the OpenVINOQuantizer."
-                    f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
-                    f" Quantizable weights are located on ports: {weights_ports_ids}."
-                )
-            weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
-            return (weight_node, target_node)
+            return OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
 
         if ip.input_port_id is None:
             return target_node
@@ -377,22 +415,67 @@ def _fill_torch_ao_annotation(
     @staticmethod
     def _get_torch_ao_qspec_from_nncf_config(
         qp: quantization.quantizer_setup.QuantizationPointBase,
-        group_size=-1,
-        weights_only=False,
+        group_size: int = -1,
+        qmode: Optional[QuantizationMode] = None,
+        weights_only: bool = False,
     ) -> QuantizationSpec:
         """
-        Retrieves the quantization configuration from the given quantization point and
-        converts it into a QuantizationSpec.
-
-        :param qp: An instance of QuantizationPointBase.
-        :return: A QuantizationSpec retrieved and converted from the quantization point.
+        Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
+        For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
+        and `weights_only`. For post-training quantization, only `qp` is required.
+
+        :param qp: Quantization point from NNCF.
+        :param group_size: Group size for INT4 group-wise quantization.
+        :param qmode: Quantization mode for weight compression.
+        :param weights_only: If True, applies weight-only quantization logic.
+        :return: A TorchAO QuantizationSpec.
         """
+        observer: Type[UniformQuantizationObserverBase]
+
         # Eps value is copied from nncf/torch/quantization/layers.py
-        extra_args = {"eps": 1e-16}
+        extra_args: Dict[str, Any] = {"eps": 1e-16}
+
+        if weights_only:
+            mapping_type = (
+                MappingType.SYMMETRIC
+                if qmode == QuantizationMode.INT4WO_SYM
+                else MappingType.ASYMMETRIC
+            )
+            if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+                extra_args["mapping_type"] = mapping_type
+                extra_args["target_dtype"] = torch.int8
+                extra_args["group_size"] = group_size
+                observer = INT4WeightObserver
+                quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
+                quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+                dtype = torch.int8
+                channel_axis = 0
+                torch_qscheme = None
+            else:
+                observer = INT8WeightObserver
+                quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
+                quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+                dtype = torch.int8
+                channel_axis = 0
+                torch_qscheme = (
+                    torch.per_channel_symmetric
+                    if qmode == QuantizationMode.INT8WO_SYM
+                    else torch.per_channel_affine
+                )
+
+            return QuantizationSpec(
+                dtype=dtype,
+                observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+                quant_min=quant_min,
+                quant_max=quant_max,
+                qscheme=torch_qscheme,
+                ch_axis=channel_axis,
+                is_dynamic=False,
+            )
+
         is_weight = qp.is_weight_quantization_point()
         qconfig = qp.qconfig
 
-        observer: Type[UniformQuantizationObserverBase]
         if qconfig.per_channel:
             torch_qscheme = (
                 torch.per_channel_symmetric
@@ -406,33 +489,16 @@ def _get_torch_ao_qspec_from_nncf_config(
                 else torch.per_tensor_affine
             )
         if is_weight:
-            mapping_type = (
-                MappingType.SYMMETRIC
-                if qconfig.mode == QuantizationScheme.SYMMETRIC
-                else MappingType.ASYMMETRIC
+            observer = PerChannelMinMaxObserver
+            quant_min = -128
+            quant_max = 127
+            dtype = torch.int8
+            channel_axis = 0
+            torch_qscheme = (
+                torch.per_channel_symmetric
+                if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+                else torch.per_channel_affine
             )
-            if qconfig.num_bits == 4:
-                extra_args["mapping_type"] = mapping_type
-                extra_args["target_dtype"] = torch.int8
-                extra_args["granularity"] = PerGroup(group_size=group_size)
-                observer = PTPerBlockParamObserver
-                quant_min = -8
-                quant_max = 7
-                dtype = torch.int8
-                channel_axis = 0
-            elif qconfig.num_bits == 8:
-                observer = (
-                    NNCFInt8observer if weights_only else PerChannelMinMaxObserver
-                )
-                quant_min = -128
-                quant_max = 127
-                dtype = torch.int8
-                channel_axis = 0
-                torch_qscheme = (
-                    torch.per_channel_symmetric
-                    if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
-                    else torch.per_channel_affine
-                )
         else:
             observer = (
                 HistogramObserver
@@ -514,4 +580,4 @@ def quantize_model(
         smooth_quant=smooth_quant,
         **kwargs,
     )
-    return quantized_model
+    return quantized_model
\ No newline at end of file
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 47527a326f9..54acf67a21d 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -50,6 +50,7 @@
     get_pt2e_quantization_params,
     get_pt2e_quantizers,
     get_qnn_quantizer,
+    get_ov_quantizer,
     get_vulkan_quantizer,
 )
 from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -205,6 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
         choices=[
             "xnnpack_dynamic",
             "xnnpack_dynamic_qc4",
+            "openvino_8da4w",
+            "openvino_8da8w",
             "qnn_8a8w",
             "qnn_16a16w",
             "qnn_16a4w",
@@ -786,6 +789,12 @@ def get_quantizer_and_quant_params(llm_config):
             llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
         )
         quantizers.append(qnn_quantizer)
+    if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
+        assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+        ov_quantizer = get_ov_quantizer(
+            llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+        )
+        quantizers.append(ov_quantizer)
     if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
         assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
         coreml_quantizer = get_coreml_quantizer(
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index d87c722363f..4669d09e0e7 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
             f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
         )
 
-    assert (
+    assert (get_qnn_quantizer
         quantization_mode is None
     ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
     qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -215,6 +215,42 @@ def get_qnn_quantizer(
     return qnn_quantizer, quant_dtype
 
 
+def get_ov_quantizer(
+    pt2e_quantize: str,
+    group_size: int = 32,
+):
+    try:
+        from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
+
+    except ImportError:
+        raise ImportError(
+            "Please install nncf via backends/openvino/requirements.txt"
+        )
+    
+    backend, quant_config = pt2e_quantize.split("_")
+    assert (
+        backend == "openvino"
+    ), f"The quantization config is for backend {backend} instead of openvino."
+    ov_quantizer = OpenVINOQuantizer()
+    # Manually ignore MP layers.
+    # ov_quantizer.set_ignored_scope()
+
+    extra_quantizer_options = {"group_size": group_size}
+    if quant_config == "8da4w":
+        mode = QuantizationMode.INT4WO_SYM
+
+    elif quant_config == "8da8w":
+        mode = QuantizationMode.INT8WO_SYM
+    else:
+        raise AssertionError(
+            f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
+        )
+    
+    ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+
+    return ov_quantizer
+
+
 def get_coreml_quantizer(pt2e_quantize: str):
     try:
         from coremltools.optimize.torch.quantization.quantization_config import (

From 4cc7694433b12f7c8afe4c61b785e5158e0798e0 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Tue, 26 Aug 2025 18:32:27 +0400
Subject: [PATCH 02/27] fixes

---
 backends/openvino/quantizer/quantizer.py  | 10 ++++--
 examples/models/llama/export_llama_lib.py |  9 +++--
 extension/llm/export/config/llm_config.py |  2 ++
 extension/llm/export/quantizer_lib.py     | 42 +++++++++++++++++++----
 4 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 31d41bff7be..f594c6fffa8 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,6 +12,7 @@
 
 import nncf  # type: ignore[import-untyped]
 import nncf.common.quantization as quantization  # type: ignore[import-untyped]
+from nncf.common.scopes import should_consider_scope  # type: ignore[import-untyped]
 import nncf.experimental.torch.fx as nncf_fx  # type: ignore[import-untyped]
 
 import torch.fx
@@ -176,8 +177,12 @@ def _annotate_weight_compression(
         """
         self._algo.set_backend_entity(model)
         nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+        ignored_names = self._algo.get_ignored_node_names(nncf_graph)
 
         for node in nodes_to_compress:
+            is_target_node = should_consider_scope(node.node_name, ignored_names)
+            if not is_target_node:
+                continue
             target_node = nncf_fx.node_utils.get_graph_node_by_name(
                 graph, node.node_name
             )
@@ -442,9 +447,9 @@ def _get_torch_ao_qspec_from_nncf_config(
                 else MappingType.ASYMMETRIC
             )
             if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
+                extra_args["group_size"] = group_size
                 extra_args["mapping_type"] = mapping_type
                 extra_args["target_dtype"] = torch.int8
-                extra_args["group_size"] = group_size
                 observer = INT4WeightObserver
                 quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
                 quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
@@ -454,7 +459,7 @@ def _get_torch_ao_qspec_from_nncf_config(
             else:
                 observer = INT8WeightObserver
                 quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
-                quant_max = 1277 if mapping_type == MappingType.SYMMETRIC else 255
+                quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
                 dtype = torch.int8
                 channel_axis = 0
                 torch_qscheme = (
@@ -462,7 +467,6 @@ def _get_torch_ao_qspec_from_nncf_config(
                     if qmode == QuantizationMode.INT8WO_SYM
                     else torch.per_channel_affine
                 )
-
             return QuantizationSpec(
                 dtype=dtype,
                 observer_or_fake_quant_ctr=observer.with_args(**extra_args),
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 54acf67a21d..269f927e9f6 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -791,8 +791,10 @@ def get_quantizer_and_quant_params(llm_config):
         quantizers.append(qnn_quantizer)
     if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
         assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+        group_size = llm_config.quantization.group_size
+        group_size = group_size if group_size else 32 
         ov_quantizer = get_ov_quantizer(
-            llm_config.quantization.pt2e_quantize.value, llm_config.quantization.group_size
+            llm_config.quantization.pt2e_quantize.value, 
         )
         quantizers.append(ov_quantizer)
     if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -904,6 +906,7 @@ def _to_edge_and_lower_llama_xnnpack(
 def _to_edge_and_lower_llama_openvino(
     builder_exported,
     modelname,
+    quantizers,
     additional_passes,
     openvino_device: str = "CPU",
     nncf_compression: bool = False,
@@ -935,7 +938,6 @@ def _to_edge_and_lower_llama_openvino(
 
         def transform_fn(prompts: str, tokenizer):
             tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
-            logging.error(tokenized_text)
 
             inputs = ()
             inputs = (
@@ -971,7 +973,7 @@ def transform_fn(prompts: str, tokenizer):
             sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
         )
 
-    builder = builder_exported.to_edge_transform_and_lower(partitioners)
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
 
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1214,6 +1216,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
         builder = _to_edge_and_lower_llama_openvino(
             builder_exported,
             modelname,
+            quantizers,
             additional_passes,
             openvino_device=llm_config.backend.openvino.device,
             nncf_compression=llm_config.backend.openvino.nncf_compression,
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index ab18c19159b..b4175d54cd7 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,6 +275,8 @@ class Pt2eQuantize(str, Enum):
 
     xnnpack_dynamic = "xnnpack_dynamic"
     xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+    openvino_8da4w = "openvino_8da4w"
+    openvino_8da8w = "openvino_8da8w"
     qnn_8a8w = "qnn_8a8w"
     qnn_16a16w = "qnn_16a16w"
     qnn_16a4w = "qnn_16a4w"
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 4669d09e0e7..2a20a90d55a 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -207,7 +207,7 @@ def get_qnn_quantizer(
             f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
         )
 
-    assert (get_qnn_quantizer
+    assert (
         quantization_mode is None
     ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
     qnn_quantizer.add_custom_quant_annotations(custom_annotations)
@@ -231,22 +231,52 @@ def get_ov_quantizer(
     assert (
         backend == "openvino"
     ), f"The quantization config is for backend {backend} instead of openvino."
-    ov_quantizer = OpenVINOQuantizer()
+    assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+
     # Manually ignore MP layers.
-    # ov_quantizer.set_ignored_scope()
+    fp_node_names = linear_list = [
+        "embedding", # First embedding is kept in Full precision
+        "linear_14",
+        "linear_15",
+        "linear_35",
+        "linear_56",
+        "linear_57",
+        "linear_63",
+        "linear_70",
+        "linear_71",
+        "linear_77",
+        "linear_78",
+        "linear_81",
+        "linear_84",
+        "linear_85",
+        "linear_88",
+        "linear_89",
+        "linear_91",
+        "linear_92",
+        "linear_95",
+        "linear_96",
+        "linear_98",
+        "linear_99",
+        "linear_102",
+        "linear_103",
+        "linear_105",
+        "linear_106",
+        "linear_109",
+        "linear_110",
+        "linear_112",]
 
-    extra_quantizer_options = {"group_size": group_size}
     if quant_config == "8da4w":
         mode = QuantizationMode.INT4WO_SYM
 
     elif quant_config == "8da8w":
+        group_size = -1
         mode = QuantizationMode.INT8WO_SYM
     else:
         raise AssertionError(
             f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
         )
-    
-    ov_quantizer = OpenVINOQuantizer(mode=mode, **extra_quantizer_options)
+    ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+    ov_quantizer.set_ignored_scope(names=fp_node_names)
 
     return ov_quantizer
 

From 5da40a57d7d42363b795d483630b00d9ce4b5f31 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Wed, 27 Aug 2025 13:48:41 +0400
Subject: [PATCH 03/27] support all_layers, backup mode in OVQuantizer

---
 backends/openvino/quantizer/quantizer.py  | 25 ++++---
 examples/models/llama/export_llama_lib.py | 82 ++++++++++-------------
 extension/llm/export/quantizer_lib.py     |  8 +--
 3 files changed, 55 insertions(+), 60 deletions(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index f594c6fffa8..2ede04e53db 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -116,8 +116,14 @@ def __init__(
                 ),  # Mode value has to match NNCF CompressWeightsMode
                 **kwargs,
             )
+            subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+            dataset = None # Only Data Free Quantization is Supported in OVQuantizer
+            compression_format = nncf.CompressionFormat.DQ
+            nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
+                subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
+                )
             self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
-                subset_size=None, **weight_compression_configuration
+                subset_size=subset_size, **weight_compression_configuration
             )
 
     def set_ignored_scope(
@@ -176,21 +182,20 @@ def _annotate_weight_compression(
         :return: Updated mapping of FX nodes with weight compression annotations.
         """
         self._algo.set_backend_entity(model)
-        nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
-        ignored_names = self._algo.get_ignored_node_names(nncf_graph)
+        all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
 
-        for node in nodes_to_compress:
-            is_target_node = should_consider_scope(node.node_name, ignored_names)
-            if not is_target_node:
-                continue
+        for wc_param in all_wc_params:
+            wc_config = wc_param.compression_config
+            node_with_weight = wc_param.node_with_weight
             target_node = nncf_fx.node_utils.get_graph_node_by_name(
-                graph, node.node_name
+                graph, node_with_weight.node_name
             )
             annotation = node_vs_torch_annotation[target_node]
             edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
-            group_size = getattr(self._algo, "_group_size", -1)
+            group_size = wc_config.group_size
+            qmode = wc_config.mode
             qspec = self._get_torch_ao_qspec_from_nncf_config(
-                qp=None, group_size=group_size, qmode=self.mode, weights_only=True
+                qp=None, group_size=group_size, qmode=qmode, weights_only=True
             )
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269f927e9f6..00785491100 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -792,9 +792,9 @@ def get_quantizer_and_quant_params(llm_config):
     if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
         assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
         group_size = llm_config.quantization.group_size
-        group_size = group_size if group_size else 32 
+        group_size = group_size if group_size else 32
         ov_quantizer = get_ov_quantizer(
-            llm_config.quantization.pt2e_quantize.value, 
+            llm_config.quantization.pt2e_quantize.value, group_size
         )
         quantizers.append(ov_quantizer)
     if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
@@ -921,59 +921,51 @@ def _to_edge_and_lower_llama_openvino(
     logging.info("Lowering model using following partitioner(s): ")
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
-
+    try:
+        import nncf
+        from functools import partial
+        from pytorch_tokenizers import get_tokenizer
+    except ImportError:
+        raise ImportError(
+            "Please install nncf via backends/openvino/requirements.txt"
+        )
+   
+    tokenizer = get_tokenizer(builder_exported.tokenizer_path)
+    from datasets import load_dataset
     # Use NNCF compression if enabled
     # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
     if nncf_compression:
-        try:
-            from functools import partial
-
-            import nncf
-            from pytorch_tokenizers import get_tokenizer
-        except ImportError:
-            raise ImportError(
-                "Please install nncf via backends/openvino/requirements.txt"
-            )
-        tokenizer = get_tokenizer(builder_exported.tokenizer_path)
-
-        def transform_fn(prompts: str, tokenizer):
-            tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
-
+        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+        dataset = dataset.filter(lambda example: example['text'].strip() != "")
+        dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
+        def transform_fn(
+            prompts: str, tokenizer
+        ):
+            tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
+            device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
             inputs = ()
             inputs = (
-                torch.tensor(tokenized_text).unsqueeze(0),
-                {"input_pos": torch.tensor([0])},
+                torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
+                {"input_pos": torch.tensor([0], device=device)},
             )
 
             return inputs
-
-        builder_exported.calibration_data = (
-            [builder_exported.calibration_data]
-            if isinstance(builder_exported.calibration_data, str)
-            else builder_exported.calibration_data
-        )
-        builder_exported.calibration_data = (
-            [
-                word
-                for prompt in builder_exported.calibration_data
-                for word in prompt.split()
-            ]
-            if not builder_exported.dynamic_shapes
-            else builder_exported.calibration_data
-        )
-
+        
         builder_exported.pre_autograd_graph_module = nncf.compress_weights(
-            builder_exported.pre_autograd_graph_module,
-            dataset=nncf.Dataset(
-                builder_exported.calibration_data,
-                transform_func=partial(transform_fn, tokenizer=tokenizer),
-            ),
-            mode=nncf.CompressWeightsMode.INT4_SYM,
-            ratio=0.8,
-            sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
-        )
+                                                            builder_exported.pre_autograd_graph_module,
+                                                            dataset=nncf.Dataset(dataset,  partial(transform_fn, tokenizer=tokenizer)),
+                                                            mode=nncf.CompressWeightsMode.INT4_SYM,
+                                                            group_size=32,
+                                                            backup_mode=nncf.BackupMode.NONE,
+                                                            ratio=0.8,
+                                                            sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+                                                        )
+ 
+        builder = builder_exported.to_edge_transform_and_lower(partitioners)
+    
+    else:
+        builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
 
-    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
 
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 2a20a90d55a..9220c1efbdc 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -221,7 +221,7 @@ def get_ov_quantizer(
 ):
     try:
         from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
-
+        import nncf
     except ImportError:
         raise ImportError(
             "Please install nncf via backends/openvino/requirements.txt"
@@ -234,8 +234,7 @@ def get_ov_quantizer(
     assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
 
     # Manually ignore MP layers.
-    fp_node_names = linear_list = [
-        "embedding", # First embedding is kept in Full precision
+    fp_node_names = [
         "linear_14",
         "linear_15",
         "linear_35",
@@ -262,8 +261,7 @@ def get_ov_quantizer(
         "linear_105",
         "linear_106",
         "linear_109",
-        "linear_110",
-        "linear_112",]
+        "linear_110",]
 
     if quant_config == "8da4w":
         mode = QuantizationMode.INT4WO_SYM

From 9e65a7ef860e5725522859bbf8d863c76e26503d Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Wed, 27 Aug 2025 17:29:05 +0400
Subject: [PATCH 04/27] clean up and use new nncf method for obtaining
 compression parameters

---
 backends/openvino/quantizer/observers.py | 127 ++++++-----------------
 backends/openvino/quantizer/quantizer.py |  52 ++++------
 2 files changed, 48 insertions(+), 131 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 2ea66f11a55..845a091d24b 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -25,10 +25,7 @@
 )
 from nncf.parameters import CompressWeightsMode  # type: ignore[import-untyped]
 from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
-    WeightCompressionConfig,
-)
-from nncf.quantization.algorithms.weight_compression.torch_fx_backend import (  # type: ignore[import-untyped]
-    FXWeightCompressionAlgoBackend,
+    WeightCompressionParameters,
 )
 from nncf.quantization.algorithms.weight_compression.weight_lowering import (  # type: ignore[import-untyped]
     do_integer_quantization,
@@ -45,19 +42,31 @@
     INT8AsymmetricWeightsDecompressor,
     INT8SymmetricWeightsDecompressor,
 )
-from torchao.quantization.pt2e import MappingType, ObserverBase
-from nncf.torch.model_graph_manager import get_weight_compression_reduction_axes
+from torchao.quantization.pt2e import ObserverBase
+
 
 class WeightObserverBase(ObserverBase, ABC):
     """
     Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
     """
 
+    def __init__(
+        self,
+        wc_param: WeightCompressionParameters,
+        dtype: torch.dtype,
+        **kwargs,
+    ) -> None:
+        """
+        :param wc_param: Weight compression parameter which contains information such as group_size
+                        reduction_axes, quantization mode etc.
+        :param dtype: target dtype for quantization such as int8, uint8, etc.
+        """
+        super().__init__(dtype=dtype, is_dynamic=False)
+        self.wc_param = wc_param
+
     def calculate_qparams(  # type: ignore[override]
         self,
         weight: torch.Tensor,
-        observer_node: torch.fx.Node,
-        model: torch.fx.GraphModule,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         """
         Calculate quantization parameters such as scale, quantized weight and zero point.
@@ -65,26 +74,11 @@ def calculate_qparams(  # type: ignore[override]
         :param weight: FP weight to be used for calculating qparams.
         :return: quantization params quantized weight, scale and zero point
         """
-        ndims = len(weight.size())
-        node_with_weight, weight_port_id = (
-            WeightObserverBase.get_node_with_weight_and_port_ids(observer_node, model)
-        )
-        _, node_metatype = GraphConverter.get_node_type_and_metatype(
-            node_with_weight, model
-        )
-        # Special case where embedding metatype has to be mapped to AtenEmbedding metatype
-        node_metatype = (
-            om.PTAtenEmbeddingMetatype
-            if node_metatype == om.PTEmbeddingMetatype
-            else node_metatype
-        )
-        reduction_dims = get_weight_compression_reduction_axes(
-            node_metatype, weight_port_id, ndims
-        )
-        reduction_dims = tuple(reduction_dims)
-
+        wc_param = self.get_wc_param()
+        wc_config = wc_param.compression_config
+        reduction_axes = wc_param.reduction_axes
         q_weight, scale, zp = do_integer_quantization(
-            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+            Tensor(weight), wc_config, reduction_axes=reduction_axes
         )
         zp = zp.data if zp is not None else None
         return q_weight.data, scale.data, zp
@@ -92,23 +86,6 @@ def calculate_qparams(  # type: ignore[override]
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
-    @staticmethod
-    def get_node_with_weight_and_port_ids(
-        observer_node: torch.fx.Node, model: torch.fx.GraphModule
-    ) -> Tuple[torch.fx.Node, int]:
-        """
-        Returns the node which contains the weight and the weight port id.
-
-        :param observer_node: Observer node for the weight.
-        :param graph: The model.
-        :return: Node which contains the weight (for eg. Linear node) and the port ID for the weight.
-        """
-        for node in model.graph.nodes:
-            if observer_node in node.all_input_nodes:
-                return node, node.all_input_nodes.index(observer_node)
-        msg = f"Observer node {observer_node.name} has no consumer node"
-        raise RuntimeError(msg)
-
     def convert(
         self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
     ) -> None:
@@ -126,7 +103,7 @@ def convert(
         weight_node = observer_node.args[0]
         original_weight = get_tensor_constant_from_node(weight_node, model)
         q_weight, scale, zero_point = self.calculate_qparams(
-            original_weight, observer_node, model
+            original_weight
         )
 
         decompressor = self._create_decompressor(
@@ -134,6 +111,7 @@ def convert(
         )
         packed_q_weight = decompressor.pack_weight(q_weight)
 
+        # Weight port id is 0 since observer is inserted for a single weight only.
         constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
 
         compressed_weight_name = observer_node.all_input_nodes[0].name
@@ -177,7 +155,7 @@ def _create_decompressor(
         pass
 
     @abstractmethod
-    def get_wc_config(self) -> WeightCompressionConfig:
+    def get_wc_param(self) -> WeightCompressionParameters:
         """
         Used to return the respective NNCF Weight Compression Config.
 
@@ -191,30 +169,6 @@ class INT4WeightObserver(WeightObserverBase):
     This class defines the behavior for INT4 Weight Compression which has per-group granularity.
     """
 
-    def __init__(
-        self,
-        group_size: int,
-        mapping_type: MappingType,
-        target_dtype: torch.dtype,
-        *args,
-        **kwargs,
-    ) -> None:
-        """
-        :param group_size: Group size for group wise quantization. group_size=-1 means it is per-channel quantization.
-        :param mapping_type: MappingType.SYMMETRIC and MappingType.ASYMMETRIC are supported types for this argument for symmetric or asymmetric quantization.
-        :param target_dtype: target dtype for quantization such as int8, uint8, etc.
-        """
-        super().__init__(dtype=target_dtype, is_dynamic=False)
-        self.wc_config = None
-        self.mapping_type = mapping_type
-
-        qmode = (
-            CompressWeightsMode.INT4_ASYM
-            if self.mapping_type == MappingType.ASYMMETRIC
-            else CompressWeightsMode.INT4_SYM
-        )
-        self.wc_config = WeightCompressionConfig(mode=qmode, group_size=group_size)
-
     def _create_decompressor(
         self,
         scale: torch.Tensor,
@@ -235,8 +189,8 @@ def _create_decompressor(
                 scale, q_weight.shape, original_weight.shape, original_weight.dtype
             )
 
-    def get_wc_config(self):
-        return self.wc_config
+    def get_wc_param(self) -> WeightCompressionParameters:
+        return self.wc_param
 
 
 class INT8WeightObserver(WeightObserverBase):
@@ -244,30 +198,6 @@ class INT8WeightObserver(WeightObserverBase):
     This class defines the behavior for Int8 WC which has per channel granularity.
     """
 
-    def __init__(
-        self,
-        qscheme: torch.qscheme,
-        dtype: torch.dtype,
-        ch_axis: int = 0,
-        *args,
-        **kwargs,
-    ) -> None:
-        """
-        :param qscheme: Quantization scheme which is per-channel for Int8 WC.
-        :param dtype: dtype for quantization such as int8, uint8, etc..
-        :param ch_axis: Channel axis.
-        """
-        super().__init__(dtype=dtype, is_dynamic=False)
-        self.wc_config = None
-        self.qscheme = qscheme
-
-        qmode = (
-            CompressWeightsMode.INT8_SYM
-            if self.qscheme == torch.per_channel_symmetric
-            else CompressWeightsMode.INT8_ASYM
-        )
-        self.wc_config = WeightCompressionConfig(mode=qmode)
-
     def _create_decompressor(
         self,
         scale: torch.Tensor,
@@ -282,5 +212,6 @@ def _create_decompressor(
         else:
             return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
 
-    def get_wc_config(self):
-        return self.wc_config
\ No newline at end of file
+    def get_wc_param(self) -> WeightCompressionParameters:
+        return self.wc_param
+    
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2ede04e53db..ef9a83ca77c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -24,9 +24,11 @@
 from nncf.quantization.quantize_model import (  # type: ignore[import-untyped]
     get_weight_compression_configuration,
 )
+from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
+    WeightCompressionParameters,
+)
 from torchao.quantization.pt2e import (
     HistogramObserver,
-    MappingType,
     PerChannelMinMaxObserver,
     UniformQuantizationObserverBase,
 )
@@ -112,16 +114,11 @@ def __init__(
         else:
             weight_compression_configuration = get_weight_compression_configuration(
                 mode.value.replace(
-                    "_wc", ""
+                    "wo", ""
                 ),  # Mode value has to match NNCF CompressWeightsMode
                 **kwargs,
             )
             subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
-            dataset = None # Only Data Free Quantization is Supported in OVQuantizer
-            compression_format = nncf.CompressionFormat.DQ
-            nncf.quantization.algorithms.weight_compression.algorithm.check_user_compression_configuration(
-                subset_size=subset_size, dataset=dataset, compression_format=compression_format, **weight_compression_configuration
-                )
             self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
                 subset_size=subset_size, **weight_compression_configuration
             )
@@ -185,17 +182,14 @@ def _annotate_weight_compression(
         all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
 
         for wc_param in all_wc_params:
-            wc_config = wc_param.compression_config
             node_with_weight = wc_param.node_with_weight
             target_node = nncf_fx.node_utils.get_graph_node_by_name(
                 graph, node_with_weight.node_name
             )
             annotation = node_vs_torch_annotation[target_node]
             edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
-            group_size = wc_config.group_size
-            qmode = wc_config.mode
             qspec = self._get_torch_ao_qspec_from_nncf_config(
-                qp=None, group_size=group_size, qmode=qmode, weights_only=True
+                qp=None, wc_param=wc_param
             )
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
@@ -425,19 +419,16 @@ def _fill_torch_ao_annotation(
     @staticmethod
     def _get_torch_ao_qspec_from_nncf_config(
         qp: quantization.quantizer_setup.QuantizationPointBase,
-        group_size: int = -1,
-        qmode: Optional[QuantizationMode] = None,
-        weights_only: bool = False,
+        wc_param: WeightCompressionParameters = None,
     ) -> QuantizationSpec:
         """
         Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
-        For weight-only quantization (e.g., INT4/INT8 compression), uses `qmode`, `group_size`,
-        and `weights_only`. For post-training quantization, only `qp` is required.
+        For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param`, which carries
+        weight-only quantization info such as group_size, reduction_axes, etc. For post-training
+        quantization, only `qp` is required.
 
         :param qp: Quantization point from NNCF.
-        :param group_size: Group size for INT4 group-wise quantization.
-        :param qmode: Quantization mode for weight compression.
-        :param weights_only: If True, applies weight-only quantization logic.
+        :param wc_param: NNCF Weight compression parameters for the node.
         :return: A TorchAO QuantizationSpec.
         """
         observer: Type[UniformQuantizationObserverBase]
@@ -445,26 +436,21 @@ def _get_torch_ao_qspec_from_nncf_config(
         # Eps value is copied from nncf/torch/quantization/layers.py
         extra_args: Dict[str, Any] = {"eps": 1e-16}
 
-        if weights_only:
-            mapping_type = (
-                MappingType.SYMMETRIC
-                if qmode == QuantizationMode.INT4WO_SYM
-                else MappingType.ASYMMETRIC
-            )
-            if qmode in [QuantizationMode.INT4WO_SYM, QuantizationMode.INT4WO_SYM]:
-                extra_args["group_size"] = group_size
-                extra_args["mapping_type"] = mapping_type
-                extra_args["target_dtype"] = torch.int8
+        if wc_param:
+            qmode = wc_param.compression_config.mode
+            if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+                extra_args["wc_param"] = wc_param
                 observer = INT4WeightObserver
-                quant_min = -8 if mapping_type == MappingType.SYMMETRIC else 0
-                quant_max = 7 if mapping_type == MappingType.SYMMETRIC else 15
+                quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+                quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
                 dtype = torch.int8
                 channel_axis = 0
                 torch_qscheme = None
             else:
+                extra_args["wc_param"] = wc_param
                 observer = INT8WeightObserver
-                quant_min = -128 if mapping_type == MappingType.SYMMETRIC else 0
-                quant_max = 127 if mapping_type == MappingType.SYMMETRIC else 255
+                quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+                quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
                 dtype = torch.int8
                 channel_axis = 0
                 torch_qscheme = (

From 53e0f4cd0e01ed5a8adb85a7c08a2722d4a5a622 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 1 Sep 2025 10:39:20 +0400
Subject: [PATCH 05/27] review changes & update method names according to wc
 algo

---
 backends/openvino/quantizer/observers.py | 4 ++--
 backends/openvino/quantizer/quantizer.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 845a091d24b..50fcc673ed6 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -30,7 +30,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import (  # type: ignore[import-untyped]
     do_integer_quantization,
 )
-from nncf.tensor.tensor import Tensor  # type: ignore[import-untyped]
+from nncf.tensor.tensor import Tensor as NNCFTensor  # type: ignore[import-untyped]
 from nncf.torch.graph.transformations.commands import (  # type: ignore[import-untyped]
     PTTargetPoint,
     TargetType,
@@ -78,7 +78,7 @@ def calculate_qparams(  # type: ignore[override]
         wc_config = wc_param.compression_config
         reduction_axes = wc_param.reduction_axes
         q_weight, scale, zp = do_integer_quantization(
-            Tensor(weight), wc_config, reduction_axes=reduction_axes
+            NNCFTensor(weight), wc_config, reduction_axes=reduction_axes
         )
         zp = zp.data if zp is not None else None
         return q_weight.data, scale.data, zp
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef9a83ca77c..2e364424b16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -179,7 +179,7 @@ def _annotate_weight_compression(
         :return: Updated mapping of FX nodes with weight compression annotations.
         """
         self._algo.set_backend_entity(model)
-        all_wc_params, _ = self._algo.get_processed_weight_compression_parameters(model, nncf_graph)
+        all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
 
         for wc_param in all_wc_params:
             node_with_weight = wc_param.node_with_weight

From bf959305dc210416f20c327509291db3655028e9 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 1 Sep 2025 11:14:13 +0400
Subject: [PATCH 06/27] review changes

---
 backends/openvino/quantizer/observers.py | 2 +-
 backends/openvino/quantizer/quantizer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 50fcc673ed6..b1054460a16 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -166,7 +166,7 @@ def get_wc_param(self) -> WeightCompressionParameters:
 
 class INT4WeightObserver(WeightObserverBase):
     """
-    This class defines the behavior for INT4 Weight Compression which has per-group granularity.
+    OpenVINO INT4 Weight Compression observer.
     """
 
     def _create_decompressor(
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 2e364424b16..485d67e3bb9 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -187,7 +187,7 @@ def _annotate_weight_compression(
                 graph, node_with_weight.node_name
             )
             annotation = node_vs_torch_annotation[target_node]
-            edge_or_node = OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
+            edge_or_node = self._get_weight_edge(target_node, nncf_graph)
             qspec = self._get_torch_ao_qspec_from_nncf_config(
                 qp=None, wc_param=wc_param
             )

From 2d4bec7a4b0041ead027a6c651e00eee32343dc4 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 1 Sep 2025 11:31:40 +0400
Subject: [PATCH 07/27] review changes

---
 backends/openvino/quantizer/observers.py  | 38 ++++++-----------------
 backends/openvino/quantizer/quantizer.py  |  7 +----
 examples/models/llama/export_llama_lib.py |  2 +-
 3 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index b1054460a16..d44a22556dd 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -9,12 +9,7 @@
 from abc import ABC, abstractmethod
 from typing import Optional, Tuple
 
-import nncf.torch.graph.operator_metatypes as om  # type: ignore[import-untyped]
-
 import torch
-from nncf.experimental.torch.fx.nncf_graph_builder import (  # type: ignore[import-untyped]
-    GraphConverter,
-)
 
 from nncf.experimental.torch.fx.node_utils import (  # type: ignore[import-untyped]
     get_tensor_constant_from_node,
@@ -23,7 +18,6 @@
     constant_update_fn,
     module_insertion_transformation_builder,
 )
-from nncf.parameters import CompressWeightsMode  # type: ignore[import-untyped]
 from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
     WeightCompressionParameters,
 )
@@ -57,9 +51,8 @@ def __init__(
         **kwargs,
     ) -> None:
         """
-        :param wc_param: Weight compression parameter which contains information such as group_size
-                        reduction_axes, quantization mode etc.
-        :param dtype: target dtype for quantization such as int8, uint8, etc.
+        :param wc_param: Weight compression parameters container.
+        :param dtype: target dtype for the quantization.
         """
         super().__init__(dtype=dtype, is_dynamic=False)
         self.wc_param = wc_param
@@ -69,10 +62,10 @@ def calculate_qparams(  # type: ignore[override]
         weight: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         """
-        Calculate quantization parameters such as scale, quantized weight and zero point.
+        Calculates quantization parameters: quantized weight, quantization scale and quantization zero point.
 
         :param weight: FP weight to be used for calculating qparams.
-        :return: quantization params quantized weight, scale and zero point
+        :return: A tuple containing the quantized weight, quantization scale and quantization zero point.
         """
         wc_param = self.get_wc_param()
         wc_config = wc_param.compression_config
@@ -90,10 +83,8 @@ def convert(
         self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
     ) -> None:
         """
-        Converts the weight observer node into a decompression subgraph after calibration.
-        This method is responsible for transforming the model after the quantization preparation
-        and calibration phases. It replaces the observer node with the quantized weight and a decompression
-        module.
+        Replaces the given observer node from the given model with a quantized
+        weight and an OpenVINO-specific decompression module.
 
         :param model: A `torch.fx.GraphModule` representing the statically traced model
                     with observer nodes attached and calibrated.
@@ -144,7 +135,7 @@ def _create_decompressor(
         original_weight: torch.Tensor,
     ) -> BaseWeightsDecompressor:
         """
-        Used to return the respective NNCF decompressor for different types of quantization.
+        Returns a respective NNCF decompressor for different types of quantization.
 
         :param scale: Calculated scale quantization parameter.
         :param zero_point: Calculated zero_point quantization parameter.
@@ -152,17 +143,14 @@ def _create_decompressor(
         :param original_weight: FP weight.
         :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
         """
-        pass
 
-    @abstractmethod
     def get_wc_param(self) -> WeightCompressionParameters:
         """
-        Used to return the respective NNCF Weight Compression Config.
+        Returns the NNCF weight compression parameters held by this observer.
 
         :return: Weight compression config with the compression information such as qmode, group_size etc.
         """
-        pass
-
+        return self.wc_param
 
 class INT4WeightObserver(WeightObserverBase):
     """
@@ -189,13 +177,10 @@ def _create_decompressor(
                 scale, q_weight.shape, original_weight.shape, original_weight.dtype
             )
 
-    def get_wc_param(self) -> WeightCompressionParameters:
-        return self.wc_param
-
 
 class INT8WeightObserver(WeightObserverBase):
     """
-    This class defines the behavior for Int8 WC which has per channel granularity.
+    OpenVINO INT8 Weight Compression per channel observer.
     """
 
     def _create_decompressor(
@@ -212,6 +197,3 @@ def _create_decompressor(
         else:
             return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
 
-    def get_wc_param(self) -> WeightCompressionParameters:
-        return self.wc_param
-    
\ No newline at end of file
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 485d67e3bb9..7f86686d03c 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -205,15 +205,10 @@ def _annotate_post_training_quantization(
         """
         Annotates the model graph with post-training quantization configurations.
 
-        Converts NNCF quantization points into TorchAO-compatible quantization specs,
-        assigning them to corresponding nodes or edges. Also handles unified scale groups,
-        ensuring shared quantization specs across grouped quantizers with consistent configs.
-
         :param model: The FX GraphModule to annotate.
         :param graph: The underlying FX graph.
         :param nncf_graph: The corresponding NNCF graph.
         :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
         :return: Updated mapping of FX nodes with post-training quantization annotations.
         """
         quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -575,4 +570,4 @@ def quantize_model(
         smooth_quant=smooth_quant,
         **kwargs,
     )
-    return quantized_model
\ No newline at end of file
+    return quantized_model
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 00785491100..269022f2cf7 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
         )
         quantizers.append(qnn_quantizer)
     if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
-        assert len(quantizers) == 0, "Should not enable both xnnpack and openvino"
+        assert quantizers, "Should not enable both xnnpack and openvino"
         group_size = llm_config.quantization.group_size
         group_size = group_size if group_size else 32
         ov_quantizer = get_ov_quantizer(

From 0a2e361f04aa724c8af7d88c1dbd286b4c7556d6 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Wed, 3 Sep 2025 20:48:10 +0400
Subject: [PATCH 08/27] Update export_llama_lib.py

---
 examples/models/llama/export_llama_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 269022f2cf7..8eab3eefbc0 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -790,7 +790,7 @@ def get_quantizer_and_quant_params(llm_config):
         )
         quantizers.append(qnn_quantizer)
     if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
-        assert quantizers, "Should not enable both xnnpack and openvino"
+        assert not quantizers, "Should not enable both xnnpack and openvino"
         group_size = llm_config.quantization.group_size
         group_size = group_size if group_size else 32
         ov_quantizer = get_ov_quantizer(

From c8ea777098b8a812e6162b767dbfeabdd7c193c4 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:39:52 +0400
Subject: [PATCH 09/27] use new transformations

---
 backends/openvino/quantizer/observers.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index d44a22556dd..76ab33eb5c5 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -15,8 +15,9 @@
     get_tensor_constant_from_node,
 )
 from nncf.experimental.torch.fx.transformations import (  # type: ignore[import-untyped]
-    constant_update_fn,
-    module_insertion_transformation_builder,
+    constant_update,
+    module_insertion,
+    node_removal,
 )
 from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
     WeightCompressionParameters,
@@ -103,7 +104,7 @@ def convert(
         packed_q_weight = decompressor.pack_weight(q_weight)
 
         # Weight port id is 0 since observer is inserted for a single weight only.
-        constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
+        constant_update(model, observer_node, packed_q_weight, input_port_id=0)
 
         compressed_weight_name = observer_node.all_input_nodes[0].name
         decompressor_suffix = "_".join(
@@ -111,7 +112,8 @@ def convert(
         )
         decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
 
-        module_insertion_transformation_builder(
+        module_insertion(
+            model,
             decompressor,
             [
                 PTTargetPoint(
@@ -120,11 +122,8 @@ def convert(
                 )
             ],
             decompressor_name,
-        )(model)
-
-        decomp_node = observer_node.args[0]
-        observer_node.replace_all_uses_with(decomp_node)  # type: ignore[arg-type]
-        model.graph.erase_node(observer_node)
+        )
+        node_removal(model, observer_node, 0)
 
     @abstractmethod
     def _create_decompressor(

From a6b605f41b5390ff9de70b2397a2d00003f34ff2 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:46:24 +0400
Subject: [PATCH 10/27] add comment for manual MP allocation

---
 extension/llm/export/quantizer_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 9220c1efbdc..e839827208c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -233,7 +233,7 @@ def get_ov_quantizer(
     ), f"The quantization config is for backend {backend} instead of openvino."
     assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
 
-    # Manually ignore MP layers.
+    # TODO: Manually ignore MP layers. This is done manually for now, until dynamic MP allocation is used.
     fp_node_names = [
         "linear_14",
         "linear_15",

From 9614fc4da170d76a39e047d0c364177bf96d0209 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:48:58 +0400
Subject: [PATCH 11/27] remove nncf_compression from export llama lib

---
 examples/models/llama/export_llama_lib.py | 54 +----------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 8eab3eefbc0..ac52893b99c 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -567,13 +567,6 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="path to the input pruning token mapping file (token_map.json)",
     )
 
-    parser.add_argument(
-        "--nncf_compression",
-        default=False,
-        action="store_true",
-        help="Enables nncf compression for openvino backend",
-    )
-
     parser.add_argument(
         "--export_only",
         default=False,
@@ -909,7 +902,6 @@ def _to_edge_and_lower_llama_openvino(
     quantizers,
     additional_passes,
     openvino_device: str = "CPU",
-    nncf_compression: bool = False,
     verbose: bool = False,
 ) -> LLMEdgeManager:  # noqa: C901
     partitioners = []
@@ -921,51 +913,8 @@ def _to_edge_and_lower_llama_openvino(
     logging.info("Lowering model using following partitioner(s): ")
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
-    try:
-        import nncf
-        from functools import partial
-        from pytorch_tokenizers import get_tokenizer
-    except ImportError:
-        raise ImportError(
-            "Please install nncf via backends/openvino/requirements.txt"
-        )
-   
-    tokenizer = get_tokenizer(builder_exported.tokenizer_path)
-    from datasets import load_dataset
-    # Use NNCF compression if enabled
-    # TODO: Enable passing OpenVINOQuantizer as a parameter to pt2e_quantize
-    if nncf_compression:
-        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
-        dataset = dataset.filter(lambda example: example['text'].strip() != "")
-        dataset = dataset.filter(lambda example: example['text'].strip() != "\n")
-        def transform_fn(
-            prompts: str, tokenizer
-        ):
-            tokenized_text = tokenizer.encode(prompts["text"], bos=False, eos=False)
-            device = torch.device("cpu") if openvino_device=="CPU" else torch.device("cuda")
-            inputs = ()
-            inputs = (
-                torch.tensor(tokenized_text[:128], device=device).unsqueeze(0),
-                {"input_pos": torch.tensor([0], device=device)},
-            )
-
-            return inputs
-        
-        builder_exported.pre_autograd_graph_module = nncf.compress_weights(
-                                                            builder_exported.pre_autograd_graph_module,
-                                                            dataset=nncf.Dataset(dataset,  partial(transform_fn, tokenizer=tokenizer)),
-                                                            mode=nncf.CompressWeightsMode.INT4_SYM,
-                                                            group_size=32,
-                                                            backup_mode=nncf.BackupMode.NONE,
-                                                            ratio=0.8,
-                                                            sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
-                                                        )
- 
-        builder = builder_exported.to_edge_transform_and_lower(partitioners)
-    
-    else:
-        builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
 
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
 
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1211,7 +1160,6 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             quantizers,
             additional_passes,
             openvino_device=llm_config.backend.openvino.device,
-            nncf_compression=llm_config.backend.openvino.nncf_compression,
             verbose=llm_config.debug.verbose,
         )
     else:

From 45007cf90c054ccfd527874ae35d383fc34a4ee8 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:52:58 +0400
Subject: [PATCH 12/27] change pt2e quantize flag to use openvino_4wo instead
 of openvino_8da4w and so on

---
 extension/llm/export/config/llm_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index b4175d54cd7..49855d61e6e 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -275,8 +275,8 @@ class Pt2eQuantize(str, Enum):
 
     xnnpack_dynamic = "xnnpack_dynamic"
     xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
-    openvino_8da4w = "openvino_8da4w"
-    openvino_8da8w = "openvino_8da8w"
+    openvino_4wo = "openvino_4wo"
+    openvino_8wo = "openvino_8wo"
     qnn_8a8w = "qnn_8a8w"
     qnn_16a16w = "qnn_16a16w"
     qnn_16a4w = "qnn_16a4w"

From 9d494147457e6696f7149e4b7cb69f95811cbd47 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:53:14 +0400
Subject: [PATCH 13/27] follow up to last commit

---
 examples/models/llama/export_llama_lib.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ac52893b99c..ec03f4b26c9 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -206,8 +206,8 @@ def build_args_parser() -> argparse.ArgumentParser:
         choices=[
             "xnnpack_dynamic",
             "xnnpack_dynamic_qc4",
-            "openvino_8da4w",
-            "openvino_8da8w",
+            "openvino_4wo",
+            "openvino_8wo",
             "qnn_8a8w",
             "qnn_16a16w",
             "qnn_16a4w",

From d6727cfed609d07281fdea42358d2e234ac82f19 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 13:56:47 +0400
Subject: [PATCH 14/27] update quantizer lib with openvino_4wo

---
 extension/llm/export/quantizer_lib.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index e839827208c..8a097f9b8f1 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -263,10 +263,10 @@ def get_ov_quantizer(
         "linear_109",
         "linear_110",]
 
-    if quant_config == "8da4w":
+    if quant_config == "4wo":
         mode = QuantizationMode.INT4WO_SYM
 
-    elif quant_config == "8da8w":
+    elif quant_config == "8wo":
         group_size = -1
         mode = QuantizationMode.INT8WO_SYM
     else:

From 4a0a7819ab69aa0d8fdfce70f3be219c14abc409 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Sat, 6 Sep 2025 14:06:48 +0400
Subject: [PATCH 15/27] split qspec function into 2 parts; 1 for WC and other
 for PTQ qspecs

---
 backends/openvino/quantizer/quantizer.py | 92 +++++++++++++-----------
 1 file changed, 50 insertions(+), 42 deletions(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7f86686d03c..ef04ed0de46 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -188,8 +188,8 @@ def _annotate_weight_compression(
             )
             annotation = node_vs_torch_annotation[target_node]
             edge_or_node = self._get_weight_edge(target_node, nncf_graph)
-            qspec = self._get_torch_ao_qspec_from_nncf_config(
-                qp=None, wc_param=wc_param
+            qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
+                wc_param=wc_param
             )
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
@@ -217,7 +217,7 @@ def _annotate_post_training_quantization(
             edge_or_node, annotation = self._get_edge_or_node_and_annotation(
                 graph, nncf_graph, qp, node_vs_torch_annotation
             )
-            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
         for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -412,18 +412,58 @@ def _fill_torch_ao_annotation(
             annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
 
     @staticmethod
-    def _get_torch_ao_qspec_from_nncf_config(
+    def _get_torch_ao_qspec_from_nncf_config_for_wc(
+        wc_param: WeightCompressionParameters,
+    ) -> QuantizationSpec:
+        """
+        Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter.
+
+        :param wc_param: NNCF Weight compression parameters for the node.
+        :return: A TorchAO QuantizationSpec.
+        """
+        observer: Type[UniformQuantizationObserverBase]
+
+        extra_args: Dict[str, Any] = {}
+
+        qmode = wc_param.compression_config.mode
+        if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
+            extra_args["wc_param"] = wc_param
+            observer = INT4WeightObserver
+            quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
+            quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+            dtype = torch.int8
+            channel_axis = 0
+            torch_qscheme = None
+        else:
+            extra_args["wc_param"] = wc_param
+            observer = INT8WeightObserver
+            quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
+            quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+            dtype = torch.int8
+            channel_axis = 0
+            torch_qscheme = (
+                torch.per_channel_symmetric
+                if qmode == QuantizationMode.INT8WO_SYM
+                else torch.per_channel_affine
+            )
+        return QuantizationSpec(
+            dtype=dtype,
+            observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+            quant_min=quant_min,
+            quant_max=quant_max,
+            qscheme=torch_qscheme,
+            ch_axis=channel_axis,
+            is_dynamic=False,
+        )
+
+    @staticmethod
+    def _get_torch_ao_qspec_from_nncf_config_for_ptq(
         qp: quantization.quantizer_setup.QuantizationPointBase,
-        wc_param: WeightCompressionParameters = None,
     ) -> QuantizationSpec:
         """
-        Returns a TorchAO QuantizationSpec based on NNCF quantization config and other arguments.
-        For weight-only quantization (e.g., INT4/INT8 compression), uses `wc_param` which carries 
-        weight only quantization info such as group_size, reduction_axes etc. For post-training 
-        quantization, only `qp` is required.
+        Returns a TorchAO QuantizationSpec based on NNCF quantization point.
 
         :param qp: Quantization point from NNCF.
-        :param wc_param: NNCF Weight compression parameters for the node.
         :return: A TorchAO QuantizationSpec.
         """
         observer: Type[UniformQuantizationObserverBase]
@@ -431,38 +471,6 @@ def _get_torch_ao_qspec_from_nncf_config(
         # Eps value is copied from nncf/torch/quantization/layers.py
         extra_args: Dict[str, Any] = {"eps": 1e-16}
 
-        if wc_param:
-            qmode = wc_param.compression_config.mode
-            if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
-                extra_args["wc_param"] = wc_param
-                observer = INT4WeightObserver
-                quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
-                quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
-                dtype = torch.int8
-                channel_axis = 0
-                torch_qscheme = None
-            else:
-                extra_args["wc_param"] = wc_param
-                observer = INT8WeightObserver
-                quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
-                quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
-                dtype = torch.int8
-                channel_axis = 0
-                torch_qscheme = (
-                    torch.per_channel_symmetric
-                    if qmode == QuantizationMode.INT8WO_SYM
-                    else torch.per_channel_affine
-                )
-            return QuantizationSpec(
-                dtype=dtype,
-                observer_or_fake_quant_ctr=observer.with_args(**extra_args),
-                quant_min=quant_min,
-                quant_max=quant_max,
-                qscheme=torch_qscheme,
-                ch_axis=channel_axis,
-                is_dynamic=False,
-            )
-
         is_weight = qp.is_weight_quantization_point()
         qconfig = qp.qconfig
 

From f6a1ee3d708ca46fe495f081bc45872042b1bed6 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 12:14:34 +0400
Subject: [PATCH 16/27] micro fix

---
 backends/openvino/quantizer/quantizer.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index ef04ed0de46..762ed2a9171 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -426,24 +426,29 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
         extra_args: Dict[str, Any] = {}
 
         qmode = wc_param.compression_config.mode
+        is_asym_mode = wc_param.compression_config.is_asym_mode
         if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
             extra_args["wc_param"] = wc_param
             observer = INT4WeightObserver
-            quant_min = -8 if not wc_param.compression_config.is_asym_mode else 0
-            quant_max = 7 if not wc_param.compression_config.is_asym_mode else 15
+            quant_min = -8 if not is_asym_mode else 0
+            quant_max = 7 if not is_asym_mode else 15
             dtype = torch.int8
             channel_axis = 0
-            torch_qscheme = None
+            torch_qscheme = torch_qscheme = (
+                torch.per_channel_symmetric
+                if not is_asym_mode
+                else torch.per_channel_affine
+            )
         else:
             extra_args["wc_param"] = wc_param
             observer = INT8WeightObserver
-            quant_min = -128 if not wc_param.compression_config.is_asym_mode else 0
-            quant_max = 127 if not wc_param.compression_config.is_asym_mode else 255
+            quant_min = -128 if not is_asym_mode else 0
+            quant_max = 127 if not is_asym_mode else 255
             dtype = torch.int8
             channel_axis = 0
             torch_qscheme = (
                 torch.per_channel_symmetric
-                if qmode == QuantizationMode.INT8WO_SYM
+                if not is_asym_mode
                 else torch.per_channel_affine
             )
         return QuantizationSpec(

From d285fcce354f8bde55e968892932cbe4a34421cd Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 15:35:49 +0400
Subject: [PATCH 17/27] update mixed precision layers for higher accuracy.
 Change INT4 mode to Asymmetric

---
 extension/llm/export/quantizer_lib.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 8a097f9b8f1..46b10dcb960 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,21 +235,17 @@ def get_ov_quantizer(
 
     # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP 
     fp_node_names = [
+        "linear_13",
         "linear_14",
-        "linear_15",
         "linear_35",
         "linear_56",
-        "linear_57",
-        "linear_63",
         "linear_70",
         "linear_71",
         "linear_77",
         "linear_78",
-        "linear_81",
         "linear_84",
         "linear_85",
         "linear_88",
-        "linear_89",
         "linear_91",
         "linear_92",
         "linear_95",
@@ -261,10 +257,11 @@ def get_ov_quantizer(
         "linear_105",
         "linear_106",
         "linear_109",
-        "linear_110",]
+        "linear_110",
+        "linear_111",]
 
     if quant_config == "4wo":
-        mode = QuantizationMode.INT4WO_SYM
+        mode = QuantizationMode.INT4WO_ASYM
 
     elif quant_config == "8wo":
         group_size = -1

From 4e66df1a52e40e90178f4c9fce815d364c5282f9 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 18:12:37 +0400
Subject: [PATCH 18/27] Apply suggestions from code review

Co-authored-by: Daniil Lyakhov <daniil.lyakhov@intel.com>
---
 backends/openvino/quantizer/observers.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 76ab33eb5c5..59a40f2be2d 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -56,9 +56,9 @@ def __init__(
         :param dtype: target dtype for the quantization.
         """
         super().__init__(dtype=dtype, is_dynamic=False)
-        self.wc_param = wc_param
+        self._wc_param = wc_param
 
-    def calculate_qparams(  # type: ignore[override]
+    def _calculate_qparams(  # type: ignore[override]
         self,
         weight: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -68,7 +68,7 @@ def calculate_qparams(  # type: ignore[override]
         :param weight: FP weight to be used for calculating qparams.
         :return: A tuple containing the quantized weight, quantization scale and quantization zero point.
         """
-        wc_param = self.get_wc_param()
+        wc_param = self._wc_param
         wc_config = wc_param.compression_config
         reduction_axes = wc_param.reduction_axes
         q_weight, scale, zp = do_integer_quantization(
@@ -143,13 +143,6 @@ def _create_decompressor(
         :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
         """
 
-    def get_wc_param(self) -> WeightCompressionParameters:
-        """
-        Returns a respective NNCF Weight Compression Config.
-
-        :return: Weight compression config with the compression information such as qmode, group_size etc.
-        """
-        return self.wc_param
 
 class INT4WeightObserver(WeightObserverBase):
     """

From e850e419cb313e86fd0f5669e7eaa1d115fcc10c Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 18:13:28 +0400
Subject: [PATCH 19/27] Review changes

---
 backends/openvino/quantizer/observers.py | 30 ++++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 59a40f2be2d..457399117e0 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -94,7 +94,7 @@ def convert(
         """
         weight_node = observer_node.args[0]
         original_weight = get_tensor_constant_from_node(weight_node, model)
-        q_weight, scale, zero_point = self.calculate_qparams(
+        q_weight, scale, zero_point = self._calculate_qparams(
             original_weight
         )
 
@@ -156,18 +156,17 @@ def _create_decompressor(
         q_weight: torch.Tensor,
         original_weight: torch.Tensor,
     ) -> BaseWeightsDecompressor:
-        if zero_point is not None:
-            return INT4AsymmetricWeightsDecompressor(
-                scale,
-                zero_point,
-                q_weight.shape,
-                original_weight.shape,
-                original_weight.dtype,
-            )
-        else:
+        if zero_point is None:
             return INT4SymmetricWeightsDecompressor(
                 scale, q_weight.shape, original_weight.shape, original_weight.dtype
             )
+        return INT4AsymmetricWeightsDecompressor(
+            scale,
+            zero_point,
+            q_weight.shape,
+            original_weight.shape,
+            original_weight.dtype,
+        )
 
 
 class INT8WeightObserver(WeightObserverBase):
@@ -182,10 +181,11 @@ def _create_decompressor(
         q_weight: torch.Tensor,
         original_weight: torch.Tensor,
     ) -> BaseWeightsDecompressor:
-        if zero_point is not None:
-            return INT8AsymmetricWeightsDecompressor(
-                scale, zero_point, original_weight.dtype
+        if zero_point is None:
+            return INT8SymmetricWeightsDecompressor(
+                scale, original_weight.dtype
             )
-        else:
-            return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+        return INT8AsymmetricWeightsDecompressor(
+            scale, zero_point, original_weight.dtype
+        )
 

From 204043f973ba928c3f2b73dc11e1db6572b7c4a7 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 18:33:16 +0400
Subject: [PATCH 20/27] review changes in quantizer

---
 backends/openvino/quantizer/quantizer.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 762ed2a9171..7e0e3c92af0 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -175,7 +175,6 @@ def _annotate_weight_compression(
         :param graph: The underlying FX graph.
         :param nncf_graph: The corresponding NNCF graph.
         :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
-
         :return: Updated mapping of FX nodes with weight compression annotations.
         """
         self._algo.set_backend_entity(model)
@@ -343,7 +342,7 @@ def _get_edge_or_node_and_annotation(
     def _get_weight_edge(
         target_node: torch.fx.Node,
         nncf_graph: NNCFGraph,
-    ):
+    ) -> tuple[torch.fx.Node, torch.fx.Node]:
         """
         Returns the FX node corresponding to the weight tensor input of a given operator node.
         Uses the NNCF graph to identify which input port of the target node holds the weight.
@@ -351,7 +350,6 @@ def _get_weight_edge(
 
         :param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
         :param nncf_graph: NNCFGraph used to determine weight port indices.
-
         :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
         """
         nncf_node = nncf_graph.get_node_by_name(target_node.name)
@@ -428,7 +426,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
         qmode = wc_param.compression_config.mode
         is_asym_mode = wc_param.compression_config.is_asym_mode
         if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
-            extra_args["wc_param"] = wc_param
             observer = INT4WeightObserver
             quant_min = -8 if not is_asym_mode else 0
             quant_max = 7 if not is_asym_mode else 15
@@ -440,7 +437,6 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
                 else torch.per_channel_affine
             )
         else:
-            extra_args["wc_param"] = wc_param
             observer = INT8WeightObserver
             quant_min = -128 if not is_asym_mode else 0
             quant_max = 127 if not is_asym_mode else 255
@@ -453,7 +449,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
             )
         return QuantizationSpec(
             dtype=dtype,
-            observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+            observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
             quant_min=quant_min,
             quant_max=quant_max,
             qscheme=torch_qscheme,

From ae6b089f293d20248df4c3d8a0d0c5ddfed62c4c Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 8 Sep 2025 18:45:54 +0400
Subject: [PATCH 21/27] revert extra args changes

---
 backends/openvino/quantizer/quantizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 7e0e3c92af0..89d528f8d16 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -424,6 +424,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
         extra_args: Dict[str, Any] = {}
 
         qmode = wc_param.compression_config.mode
+        extra_args["wc_param"] = wc_param
         is_asym_mode = wc_param.compression_config.is_asym_mode
         if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
             observer = INT4WeightObserver
@@ -449,7 +450,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
             )
         return QuantizationSpec(
             dtype=dtype,
-            observer_or_fake_quant_ctr=observer.with_args(wc_param=wc_param),
+            observer_or_fake_quant_ctr=observer.with_args(**extra_args),
             quant_min=quant_min,
             quant_max=quant_max,
             qscheme=torch_qscheme,

From 2de569398917362b9ffc02849037528c2a15efa7 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Tue, 9 Sep 2025 11:43:00 +0400
Subject: [PATCH 22/27] precommit fixes

---
 backends/openvino/quantizer/observers.py  | 11 +++------
 backends/openvino/quantizer/quantizer.py  | 30 +++++++++++++----------
 examples/models/llama/export_llama_lib.py |  6 +++--
 extension/llm/export/quantizer_lib.py     | 21 +++++++++-------
 4 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index 457399117e0..faeb4fa7a60 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -84,7 +84,7 @@ def convert(
         self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
     ) -> None:
         """
-        Replaces the given observer node from the given model with a quantized 
+        Replaces the given observer node from the given model with a quantized
         weight and a OpenVINO specific decompression module.
 
         :param model: A `torch.fx.GraphModule` representing the statically traced model
@@ -94,9 +94,7 @@ def convert(
         """
         weight_node = observer_node.args[0]
         original_weight = get_tensor_constant_from_node(weight_node, model)
-        q_weight, scale, zero_point = self._calculate_qparams(
-            original_weight
-        )
+        q_weight, scale, zero_point = self._calculate_qparams(original_weight)
 
         decompressor = self._create_decompressor(
             scale, zero_point, q_weight, original_weight
@@ -182,10 +180,7 @@ def _create_decompressor(
         original_weight: torch.Tensor,
     ) -> BaseWeightsDecompressor:
         if zero_point is None:
-            return INT8SymmetricWeightsDecompressor(
-                scale, original_weight.dtype
-            )
+            return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
         return INT8AsymmetricWeightsDecompressor(
             scale, zero_point, original_weight.dtype
         )
-
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index 9db79fce9f9..bef1ef3274f 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -12,7 +12,6 @@
 
 import nncf  # type: ignore[import-untyped]
 import nncf.common.quantization as quantization  # type: ignore[import-untyped]
-from nncf.common.scopes import should_consider_scope  # type: ignore[import-untyped]
 import nncf.experimental.torch.fx as nncf_fx  # type: ignore[import-untyped]
 
 import torch.fx
@@ -21,12 +20,12 @@
     INT8WeightObserver,
 )
 from nncf.common.graph.graph import NNCFGraph  # type: ignore[import-untyped]
-from nncf.quantization.quantize_model import (  # type: ignore[import-untyped]
-    get_weight_compression_configuration,
-)
 from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
     WeightCompressionParameters,
 )
+from nncf.quantization.quantize_model import (  # type: ignore[import-untyped]
+    get_weight_compression_configuration,
+)
 from torchao.quantization.pt2e import (
     HistogramObserver,
     PerChannelMinMaxObserver,
@@ -118,7 +117,7 @@ def __init__(
                 ),  # Mode value has to match NNCF CompressWeightsMode
                 **kwargs,
             )
-            subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
+            subset_size = 1  # Doesn't really matter in this case since it is data-free. Should just be +ve
             self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
                 subset_size=subset_size, **weight_compression_configuration
             )
@@ -178,7 +177,9 @@ def _annotate_weight_compression(
         :return: Updated mapping of FX nodes with weight compression annotations.
         """
         self._algo.set_backend_entity(model)
-        all_wc_params, _ = self._algo.get_weight_compression_parameters(model, nncf_graph)
+        all_wc_params, _ = self._algo.get_weight_compression_parameters(
+            model, nncf_graph
+        )
 
         for wc_param in all_wc_params:
             node_with_weight = wc_param.node_with_weight
@@ -187,9 +188,7 @@ def _annotate_weight_compression(
             )
             annotation = node_vs_torch_annotation[target_node]
             edge_or_node = self._get_weight_edge(target_node, nncf_graph)
-            qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(
-                wc_param=wc_param
-            )
+            qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param)
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
         return node_vs_torch_annotation
@@ -216,7 +215,9 @@ def _annotate_post_training_quantization(
             edge_or_node, annotation = self._get_edge_or_node_and_annotation(
                 graph, nncf_graph, qp, node_vs_torch_annotation
             )
-            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+            qspec: QuantizationSpecBase = (
+                self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+            )
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
         for quantizer_ids in quantization_setup.unified_scale_groups.values():
@@ -426,8 +427,11 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
         qmode = wc_param.compression_config.mode
         extra_args["wc_param"] = wc_param
         is_asym_mode = wc_param.compression_config.is_asym_mode
-        if qmode in [nncf.CompressWeightsMode.INT4_ASYM, nncf.CompressWeightsMode.INT4_SYM]:
-            observer = INT4WeightObserver
+        if qmode in [
+            nncf.CompressWeightsMode.INT4_ASYM,
+            nncf.CompressWeightsMode.INT4_SYM,
+        ]:
+            observer = INT4WeightObserver  # type: ignore[type-abstract]
             quant_min = -8 if not is_asym_mode else 0
             quant_max = 7 if not is_asym_mode else 15
             dtype = torch.int8
@@ -438,7 +442,7 @@ def _get_torch_ao_qspec_from_nncf_config_for_wc(
                 else torch.per_channel_affine
             )
         else:
-            observer = INT8WeightObserver
+            observer = INT8WeightObserver  # type: ignore[type-abstract]
             quant_min = -128 if not is_asym_mode else 0
             quant_max = 127 if not is_asym_mode else 255
             dtype = torch.int8
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 578fd0fea7b..d9c282888cc 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -43,10 +43,10 @@
 )
 from executorch.extension.llm.export.quantizer_lib import (
     get_coreml_quantizer,
+    get_ov_quantizer,
     get_pt2e_quantization_params,
     get_pt2e_quantizers,
     get_qnn_quantizer,
-    get_ov_quantizer,
     get_vulkan_quantizer,
 )
 from executorch.util.activation_memory_profiler import generate_memory_trace
@@ -897,7 +897,9 @@ def _to_edge_and_lower_llama_openvino(
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
 
-    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(partitioners)
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+        partitioners
+    )
 
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 83d4a84420d..df8c2a5e36c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -220,20 +220,22 @@ def get_ov_quantizer(
     group_size: int = 32,
 ):
     try:
-        from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode
-        import nncf
-    except ImportError:
-        raise ImportError(
-            "Please install nncf via backends/openvino/requirements.txt"
+        from executorch.backends.openvino.quantizer import (
+            OpenVINOQuantizer,
+            QuantizationMode,
         )
-    
+    except ImportError:
+        raise ImportError("Please install nncf via backends/openvino/requirements.txt")
+
     backend, quant_config = pt2e_quantize.split("_")
     assert (
         backend == "openvino"
     ), f"The quantization config is for backend {backend} instead of openvino."
-    assert group_size != None, "Group Size None is Not Supported. It should be set to -1 for per-channel."
+    assert (
+        group_size
+    ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
 
-    # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP 
+    # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
     fp_node_names = [
         "linear_13",
         "linear_14",
@@ -258,7 +260,8 @@ def get_ov_quantizer(
         "linear_106",
         "linear_109",
         "linear_110",
-        "linear_111",]
+        "linear_111",
+    ]
 
     if quant_config == "4wo":
         mode = QuantizationMode.INT4WO_ASYM

From 0e10f28242129a3c332ccdbd7a3b9a4340a8e1a1 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Tue, 9 Sep 2025 21:52:23 +0400
Subject: [PATCH 23/27] revert _calculate_qparams back to calculate_qparams

---
 backends/openvino/quantizer/observers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
index faeb4fa7a60..6cda4561604 100644
--- a/backends/openvino/quantizer/observers.py
+++ b/backends/openvino/quantizer/observers.py
@@ -58,7 +58,7 @@ def __init__(
         super().__init__(dtype=dtype, is_dynamic=False)
         self._wc_param = wc_param
 
-    def _calculate_qparams(  # type: ignore[override]
+    def calculate_qparams(  # type: ignore[override]
         self,
         weight: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
@@ -94,7 +94,7 @@ def convert(
         """
         weight_node = observer_node.args[0]
         original_weight = get_tensor_constant_from_node(weight_node, model)
-        q_weight, scale, zero_point = self._calculate_qparams(original_weight)
+        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
 
         decompressor = self._create_decompressor(
             scale, zero_point, q_weight, original_weight

From 05f5a929c7c5b9a79859d9c9848ce37dd0c16b41 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Wed, 10 Sep 2025 18:49:08 +0400
Subject: [PATCH 24/27] remove manual ignored nodes

---
 extension/llm/export/quantizer_lib.py | 29 ---------------------------
 1 file changed, 29 deletions(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index df8c2a5e36c..870080a7549 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,34 +235,6 @@ def get_ov_quantizer(
         group_size
     ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
 
-    # (TODO) Manually ignore MP layers. This is done manually for now till we use the dynamic allocation MP
-    fp_node_names = [
-        "linear_13",
-        "linear_14",
-        "linear_35",
-        "linear_56",
-        "linear_70",
-        "linear_71",
-        "linear_77",
-        "linear_78",
-        "linear_84",
-        "linear_85",
-        "linear_88",
-        "linear_91",
-        "linear_92",
-        "linear_95",
-        "linear_96",
-        "linear_98",
-        "linear_99",
-        "linear_102",
-        "linear_103",
-        "linear_105",
-        "linear_106",
-        "linear_109",
-        "linear_110",
-        "linear_111",
-    ]
-
     if quant_config == "4wo":
         mode = QuantizationMode.INT4WO_ASYM
 
@@ -274,7 +246,6 @@ def get_ov_quantizer(
             f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
         )
     ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
-    ov_quantizer.set_ignored_scope(names=fp_node_names)
 
     return ov_quantizer
 

From fbe0e21137ee9ebc8ea246e61fd9cfa252f57b15 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Wed, 10 Sep 2025 18:52:42 +0400
Subject: [PATCH 25/27] add ratio to quantizer initialization

---
 extension/llm/export/quantizer_lib.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 870080a7549..350e8b3ce7c 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -235,17 +235,23 @@ def get_ov_quantizer(
         group_size
     ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
 
+    quantization_params = {}
+
     if quant_config == "4wo":
-        mode = QuantizationMode.INT4WO_ASYM
+        quantization_params["mode"] = QuantizationMode.INT4WO_ASYM
+        quantization_params["group_size"] = group_size
+        quantization_params["ratio"] = 0.8
 
     elif quant_config == "8wo":
-        group_size = -1
-        mode = QuantizationMode.INT8WO_SYM
+        quantization_params["mode"] = QuantizationMode.INT8WO_ASYM
+        quantization_params["group_size"] = -1
+        quantization_params["ratio"] = None
+
     else:
         raise AssertionError(
             f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
         )
-    ov_quantizer = OpenVINOQuantizer(mode=mode, group_size=group_size)
+    ov_quantizer = OpenVINOQuantizer(**quantization_params)
 
     return ov_quantizer
 

From 6bff1cdb00ebdae53b57ab706cab6e9e9ee7e335 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Thu, 11 Sep 2025 23:04:13 +0400
Subject: [PATCH 26/27] Raise OpenVINO fallback group_size from 32 to 128 in export_llama_lib.py

---
 examples/models/llama/export_llama_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index d9c282888cc..cbbf169a085 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -768,7 +768,7 @@ def get_quantizer_and_quant_params(llm_config):
     if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
         assert not quantizers, "Should not enable both xnnpack and openvino"
         group_size = llm_config.quantization.group_size
-        group_size = group_size if group_size else 32
+        group_size = group_size if group_size else 128
         ov_quantizer = get_ov_quantizer(
             llm_config.quantization.pt2e_quantize.value, group_size
         )

From d744ae95f3cf806278b12db346105e233a2daec5 Mon Sep 17 00:00:00 2001
From: Aamir Nazir <aamir.nazir@intel.com>
Date: Thu, 11 Sep 2025 23:04:50 +0400
Subject: [PATCH 27/27] Change get_ov_quantizer default group_size from 32 to 128 in quantizer_lib.py

---
 extension/llm/export/quantizer_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 350e8b3ce7c..f92c59cebd3 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -217,7 +217,7 @@ def get_qnn_quantizer(
 
 def get_ov_quantizer(
     pt2e_quantize: str,
-    group_size: int = 32,
+    group_size: int = 128,
 ):
     try:
         from executorch.backends.openvino.quantizer import (