Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ece4a0b
Extend quantizer to support compress_pt2e
anzr299 Jan 16, 2026
9cc0991
integrate compress_pt2e into the example
anzr299 Jan 16, 2026
b7bac57
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Jan 19, 2026
6c0d766
remove extra directories
anzr299 Jan 19, 2026
fcd40bb
Merge branch 'an/openvino/nncf_compress_pt2e' of https://github.com/a…
anzr299 Jan 19, 2026
f9c782b
review changes
anzr299 Jan 21, 2026
24f684f
lint
anzr299 Jan 21, 2026
0963b73
add unit test
anzr299 Jan 21, 2026
792caf2
add some corner case checks in llm compression
anzr299 Jan 21, 2026
dc3b219
clean unused imports
anzr299 Jan 21, 2026
0d3d681
lint
anzr299 Jan 21, 2026
12efc70
review changes
anzr299 Jan 22, 2026
1236dfc
compare reference scale values in tests
anzr299 Jan 22, 2026
019b2cc
remove dead code
anzr299 Jan 22, 2026
659a834
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 3, 2026
562261f
lint fixes
anzr299 Feb 3, 2026
ecd5b8a
extend test for error
anzr299 Feb 3, 2026
d72466d
lint
anzr299 Feb 3, 2026
6e349c3
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 4, 2026
83f0fb8
remove leading space in error message
anzr299 Feb 4, 2026
42fc491
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 6, 2026
ba68d56
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 23, 2026
b1b2fb2
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 24, 2026
0093592
update nncf version to 3.0.0
anzr299 Feb 24, 2026
0c82495
Merge branch 'main' into an/openvino/nncf_compress_pt2e
suryasidd Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion backends/openvino/quantizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from .llm_compression import apply_nncf_data_aware_compression
from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model

# Public API of the OpenVINO quantizer package. Keep this in sync with the
# names imported above; there must be exactly one __all__ definition.
__all__ = [
    "OpenVINOQuantizer",
    "quantize_model",
    "QuantizationMode",
    "apply_nncf_data_aware_compression",
]
133 changes: 133 additions & 0 deletions backends/openvino/quantizer/llm_compression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Copyright (c) Intel Corporation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a basic unit test that calls this flow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

#
# Licensed under the BSD License (the "License"); you may not use this file
# except in compliance with the License. See the license file found in the
# LICENSE file in the root directory of this source tree.

# mypy: disable-error-code=import-not-found

from typing import Tuple

import torch
from executorch.extension.llm.export.builder import LLMEdgeManager
from torchao.quantization.pt2e.quantizer import Quantizer

try:
import nncf # type: ignore[import-untyped]
from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped]
except ImportError:
raise ImportError("Please install nncf via backends/openvino/requirements.txt")


# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278
# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278
def get_calibration_data(
    module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
):
    """
    Build token-level calibration samples from a prompt.

    Encodes the prompt, then autoregressively feeds tokens through *module*
    (greedy argmax decoding) until EOS is produced or *max_len* steps have
    run, so calibration covers both prompt tokens and model outputs.
    Currently, this method is only tested with Llama models.

    :param module: Exported FX graph taking (token, {"input_pos": pos}).
    :param tokenizer: Tokenizer exposing ``encode`` and ``eos_id``.
    :param prompts: Calibration prompt text.
    :param max_len: Maximum number of decoding steps.
    :return: List of (position tensor, token id) pairs, one per token.
    """
    # TODO: change criteria & support batch inputs if necessary
    tokens = tokenizer.encode(prompts, bos=True, eos=False)
    step = 0
    with torch.no_grad():
        # Stop once the model emits EOS or we hit the length budget.
        while tokens[-1] != tokenizer.eos_id and step < max_len:
            step_input = torch.full((1, 1), tokens[step])
            logits = module(step_input, {"input_pos": torch.tensor((step,))})
            step += 1
            # Past the prompt: extend with the greedily decoded next token.
            if step >= len(tokens):
                tokens.append(torch.argmax(logits, dim=-1).item())
    return [
        (torch.tensor(idx, dtype=torch.int64), token_id)
        for idx, token_id in enumerate(tokens)
    ]


def transform_fn(token_pos_map: Tuple[int, int]):
    """
    Convert one calibration sample into the model's input format.
    Currently, this method is only tested with Llama models.

    :param token_pos_map: Pair of (position, token ID) for one token.
    :return: Tuple of (token tensor of shape (1, 1), kwargs dict with
        "input_pos" holding the position as a 1-element tensor).
    """
    position = token_pos_map[0]
    token_id = token_pos_map[1]
    return (
        torch.tensor([[token_id]]),
        {"input_pos": torch.tensor([position])},
    )


def apply_nncf_data_aware_compression(
    builder_exported: LLMEdgeManager,
    quantizer: Quantizer,
    awq: bool,
    scale_estimation: bool,
) -> LLMEdgeManager:
    """
    Applies NNCF data-aware weight compression to the exported LLM graph.
    Uses the builder's tokenizer and calibration prompt to generate token-level
    calibration data, then runs `nncf.experimental.torch.fx.compress_pt2e` with
    the given quantizer and optional AWQ / scale estimation enabled.

    :param builder_exported: LLMEdgeManager containing the FX graph, tokenizer path,
        calibration prompt, and max sequence length.
    :param quantizer: TorchAO quantizer to use for compression.
    :param awq: If True, enables Activation-aware Weights Quantization (AWQ).
    :param scale_estimation: If True, enables NNCF's scale estimation algorithm.
    :raises ValueError: If scale estimation is requested but the calibration
        inputs needed to build a dataset are missing.
    :return: The updated LLMEdgeManager with compressed torch FX model
    """
    # Only build the (potentially expensive) calibration dataset when a
    # data-aware algorithm was requested AND every required input is present.
    nncf_calibration_data = None
    if (
        builder_exported.calibration_seq_length is not None
        and builder_exported.calibration_data is not None
        and builder_exported.tokenizer_path is not None
        and (awq or scale_estimation)
    ):
        tokenizer = get_tokenizer(builder_exported.tokenizer_path)
        nncf_calibration_data = nncf.Dataset(
            get_calibration_data(
                builder_exported.pre_autograd_graph_module,  # type: ignore[arg-type]
                tokenizer,
                builder_exported.calibration_data,
                builder_exported.calibration_seq_length,
            ),
            transform_func=transform_fn,
        )

    # AWQ can work without a dataset as well.
    # Scale estimation cannot: if the dataset could not be built, report
    # exactly which calibration parameter(s) the caller left unset.
    if scale_estimation and not nncf_calibration_data:
        missing_params = []
        if builder_exported.calibration_data is None:
            missing_params.append("calibration_data")
        if builder_exported.calibration_seq_length is None:
            missing_params.append("calibration_seq_length")
        if builder_exported.tokenizer_path is None:
            missing_params.append("tokenizer_path")
        if missing_params:
            msg = (
                "Missing required calibration parameter(s): "
                + ", ".join(missing_params)
                + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path."
            )
            raise ValueError(msg)

    # Compress weights in place on the pre-autograd FX graph; AWQ and scale
    # estimation are delegated to NNCF's compress_pt2e implementation.
    builder_exported.pre_autograd_graph_module = (
        nncf.experimental.torch.fx.compress_pt2e(
            builder_exported.pre_autograd_graph_module,
            quantizer=quantizer,
            dataset=nncf_calibration_data,
            awq=awq,
            scale_estimation=scale_estimation,
        )
    )
    return builder_exported
106 changes: 90 additions & 16 deletions backends/openvino/quantizer/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]

import torch.fx
from executorch.backends.openvino.quantizer.observers import (
INT4WeightObserver,
Expand Down Expand Up @@ -78,12 +77,12 @@ class OpenVINOQuantizer(Quantizer):
optimally for the inference via OpenVINO.
"""

# Maps weights-only quantization modes to the corresponding NNCF
# CompressWeightsMode string identifiers. There must be exactly one
# definition of this mapping.
WEIGHTS_ONLY_COMPRESSION_MODES = {
    QuantizationMode.INT4WO_SYM: "int4_sym",
    QuantizationMode.INT4WO_ASYM: "int4_asym",
    QuantizationMode.INT8WO_SYM: "int8_sym",
    QuantizationMode.INT8WO_ASYM: "int8_asym",
}

def __init__(
self,
Expand Down Expand Up @@ -116,17 +115,63 @@ def __init__(
preset=preset, model_type=model_type, **kwargs
)
else:
compression_mode = mode.value.replace(
"wo", ""
) # Mode value has to match NNCF CompressWeightsMode
compression_mode = OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES[
mode
] # Mode value has to match NNCF CompressWeightsMode
weight_compression_configuration = get_weight_compression_configuration(
nncf.CompressWeightsMode(compression_mode),
**kwargs,
)
subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
weight_compression_configuration["subset_size"] = (
1 # Doesn't really matter in this case since it is data-free. Should just be +ve
)

self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
**weight_compression_configuration
)

def _require_wc_algo(
    self,
) -> nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression:
    """Return the configured algorithm, asserting it is WeightCompression.

    :raises TypeError: If the quantizer was built in a PTQ mode instead.
    """
    expected_cls = (
        nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression
    )
    if isinstance(self._algo, expected_cls):
        return self._algo
    raise TypeError(
        "This method requires WeightCompression algo, but "
        f"got {type(self._algo).__name__} (mode={self.mode})."
    )

def _require_ptq_algo(self) -> MinMaxQuantization:
    """Return the configured algorithm, asserting it is MinMaxQuantization.

    :raises TypeError: If the quantizer was built in a weights-only mode.
    """
    if isinstance(self._algo, MinMaxQuantization):
        return self._algo
    raise TypeError(
        "This method requires MinMaxQuantization algo, but "
        f"got {type(self._algo).__name__} (mode={self.mode})."
    )

def get_weights_compression_config(self) -> Dict[str, Any]:
    """
    Returns a dictionary with all_layers, group_size, backup_mode and Quantization mode parameters
    used by the compress_pt2e weight compression algorithm.

    :return: A dictionary containing:
        1. mode: Quantization mode. One of INT4 Sym, INT4 Asym, INT8 Sym, INT8 Asym.
        2. group_size: group size to be used for group-wise compression.
        3. all_layers: Indicates whether embeddings and last MatMul layers should be compressed to a primary
        precision. By default, the backup precision is assigned for the embeddings and last MatMul layers.
        4. backup_mode: Defines a backup mode for mixed-precision weight compression.
    """
    wc_algo = self._require_wc_algo()
    # Expose only the attributes that the quantizer itself initialized.
    return {
        "mode": wc_algo.mode,
        "group_size": wc_algo.group_size,
        "all_layers": wc_algo.all_layers,
        "backup_mode": wc_algo.backup_mode,
    }

def set_ignored_scope(
self,
Expand Down Expand Up @@ -160,8 +205,32 @@ def set_ignored_scope(
def get_nncf_quantization_setup(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
self._algo._set_backend_entity(model)
return self._algo.find_quantization_setup(model, nncf_graph)
algo = self._require_ptq_algo()
algo._set_backend_entity(model)
return algo.find_quantization_setup(model, nncf_graph)

def get_nncf_weight_compression_parameters(
    self,
    model: torch.fx.GraphModule,
    nncf_graph: NNCFGraph,
) -> Tuple[
    List[WeightCompressionParameters],
    List[WeightCompressionParameters],
    List[WeightCompressionParameters],
]:
    """
    Collect weight compression parameters for the given FX model and NNCF graph.

    :param model: FX GraphModule to analyze for weight compression.
    :param nncf_graph: NNCFGraph representation of the model.
    :return: A tuple of:
        - all parameters eligible for weight compression,
        - ratio-defining parameters used to set primary/backup precisions,
        - parameters that are not compressible and remain in original precision.
    """
    wc_algo = self._require_wc_algo()
    # The backend entity must be bound to this model before parameters
    # can be collected.
    wc_algo.set_backend_entity(model)
    return wc_algo.get_weight_compression_parameters(model, nncf_graph)

def _annotate_weight_compression(
self,
Expand All @@ -182,12 +251,17 @@ def _annotate_weight_compression(
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
all_wc_params, _ = self._algo.get_weight_compression_parameters(
all_wc_params, *_ = self.get_nncf_weight_compression_parameters(
model, nncf_graph
)

for wc_param in all_wc_params:
if not wc_param.compression_config:
nncf_logger.debug(
"Skipping weight compression for node '%s' because compression_config is missing.",
getattr(wc_param.node_with_weight, "node_name", "<unknown>"),
)
continue
node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node_with_weight.node_name
Expand Down
2 changes: 1 addition & 1 deletion backends/openvino/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf
nncf==3.0.0
3 changes: 3 additions & 0 deletions backends/openvino/tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ backends/openvino/tests
└── test_<op_name>.py # Individual op tests scripts.
├── models # Directory with model test scripts.
└── test_classification.py # Test script for classification models.
├── quantizer # Directory with quantizer test scripts.
└── test_llm_compression.py # Test script for llm compression using NNCF algorithms.
├── README.md # Documentation for unit tests (this file)
└── test_runner.py # Script to execute unit tests.
```
Expand All @@ -31,6 +33,7 @@ Before you begin, refer to instructions provided in [OpenVINO Backend for ExecuT
Supported values:
- `ops` (default)
- `models`
- `quantizer`

- **`--pattern`** (optional):
Pattern to match test files. Provide complete file name to run individual tests. The default value is `test_*.py`
Expand Down
22 changes: 22 additions & 0 deletions backends/openvino/tests/quantizer/synthetic_test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import torch


class ExportLlamaTestModel(torch.nn.Module):
    """Minimal Llama-shaped model (embedding -> linear stack -> LM head)
    used as a synthetic fixture in quantizer/compression unit tests."""

    def __init__(self, vocab_size=5, hidden_size=2, num_layers=1):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.layers = torch.nn.ModuleList(
            torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)
        )
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size

    def forward(self, tokens, input_pos):
        # input_pos is accepted to mirror the Llama export signature but is
        # not used by this synthetic model.
        hidden = self.embed(tokens)
        for block in self.layers:
            hidden = torch.relu(block(hidden))
        return self.lm_head(hidden)
Loading
Loading