Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ece4a0b
Extend quantizer to support compress_pt2e
anzr299 Jan 16, 2026
9cc0991
integrate compress_pt2e into the example
anzr299 Jan 16, 2026
b7bac57
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Jan 19, 2026
6c0d766
remove extra directories
anzr299 Jan 19, 2026
fcd40bb
Merge branch 'an/openvino/nncf_compress_pt2e' of https://github.com/a…
anzr299 Jan 19, 2026
f9c782b
review changes
anzr299 Jan 21, 2026
24f684f
lint
anzr299 Jan 21, 2026
0963b73
add unit test
anzr299 Jan 21, 2026
792caf2
add some corner case checks in llm compression
anzr299 Jan 21, 2026
dc3b219
clean unused imports
anzr299 Jan 21, 2026
0d3d681
lint
anzr299 Jan 21, 2026
12efc70
review changes
anzr299 Jan 22, 2026
1236dfc
compare reference scale values in tests
anzr299 Jan 22, 2026
019b2cc
remove dead code
anzr299 Jan 22, 2026
659a834
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 3, 2026
562261f
lint fixes
anzr299 Feb 3, 2026
ecd5b8a
extend test for error
anzr299 Feb 3, 2026
d72466d
lint
anzr299 Feb 3, 2026
6e349c3
Merge branch 'main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 4, 2026
83f0fb8
remove leading space in error message
anzr299 Feb 4, 2026
42fc491
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 6, 2026
ba68d56
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 23, 2026
b1b2fb2
Merge branch 'pytorch:main' into an/openvino/nncf_compress_pt2e
anzr299 Feb 24, 2026
0093592
update nncf version to 3.0.0
anzr299 Feb 24, 2026
0c82495
Merge branch 'main' into an/openvino/nncf_compress_pt2e
suryasidd Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion backends/openvino/quantizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from .llm_compression import apply_nncf_data_aware_compression
from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model

# Public API of the OpenVINO quantizer package. Keep this in sync with the
# names imported above; there must be exactly one __all__ definition.
__all__ = [
    "OpenVINOQuantizer",
    "quantize_model",
    "QuantizationMode",
    "apply_nncf_data_aware_compression",
]
133 changes: 133 additions & 0 deletions backends/openvino/quantizer/llm_compression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Copyright (c) Intel Corporation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a basic unit test that calls this flow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

#
# Licensed under the BSD License (the "License"); you may not use this file
# except in compliance with the License. See the license file found in the
# LICENSE file in the root directory of this source tree.

# mypy: disable-error-code=import-not-found

from typing import Tuple

import torch
from executorch.extension.llm.export.builder import LLMEdgeManager
from torchao.quantization.pt2e.quantizer import Quantizer

try:
import nncf # type: ignore[import-untyped]
from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped]
except ImportError:
raise ImportError("Please install nncf via backends/openvino/requirements.txt")


# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278
# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278
def get_calibration_data(
    module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
):
    """
    Build token-level calibration samples from a prompt.

    Encodes the prompt, then autoregressively feeds tokens through *module*
    (greedy argmax decoding) until EOS is produced or *max_len* steps have
    run, so calibration covers both prompt tokens and model outputs.
    Currently, this method is only tested with Llama models.

    :param module: Exported FX graph taking (token, {"input_pos": pos}).
    :param tokenizer: Tokenizer exposing ``encode`` and ``eos_id``.
    :param prompts: Calibration prompt text.
    :param max_len: Maximum number of decoding steps.
    :return: List of (position tensor, token id) pairs, one per token.
    """
    # TODO: change criteria & support batch inputs if necessary
    tokens = tokenizer.encode(prompts, bos=True, eos=False)
    step = 0
    with torch.no_grad():
        # Stop once the model emits EOS or we hit the length budget.
        while tokens[-1] != tokenizer.eos_id and step < max_len:
            step_input = torch.full((1, 1), tokens[step])
            logits = module(step_input, {"input_pos": torch.tensor((step,))})
            step += 1
            # Past the prompt: extend with the greedily decoded next token.
            if step >= len(tokens):
                tokens.append(torch.argmax(logits, dim=-1).item())
    return [
        (torch.tensor(idx, dtype=torch.int64), token_id)
        for idx, token_id in enumerate(tokens)
    ]


def transform_fn(token_pos_map: Tuple[int, int]):
    """
    Convert one calibration sample into the model's input format.
    Currently, this method is only tested with Llama models.

    :param token_pos_map: Pair of (position, token ID) for one token.
    :return: Tuple of (token tensor of shape (1, 1), kwargs dict with
        "input_pos" holding the position as a 1-element tensor).
    """
    position = token_pos_map[0]
    token_id = token_pos_map[1]
    return (
        torch.tensor([[token_id]]),
        {"input_pos": torch.tensor([position])},
    )


def apply_nncf_data_aware_compression(
    builder_exported: LLMEdgeManager,
    quantizer: Quantizer,
    awq: bool,
    scale_estimation: bool,
) -> LLMEdgeManager:
    """
    Applies NNCF data-aware weight compression to the exported LLM graph.
    Uses the builder's tokenizer and calibration prompt to generate token-level
    calibration data, then runs `nncf.experimental.torch.fx.compress_pt2e` with
    the given quantizer and optional AWQ / scale estimation enabled.

    :param builder_exported: LLMEdgeManager containing the FX graph, tokenizer path,
        calibration prompt, and max sequence length.
    :param quantizer: TorchAO quantizer to use for compression.
    :param awq: If True, enables Activation-aware Weights Quantization (AWQ).
    :param scale_estimation: If True, enables NNCF's scale estimation algorithm.
    :raises ValueError: If scale estimation is requested but the calibration
        inputs needed to build a dataset are missing.
    :return: The updated LLMEdgeManager with compressed torch FX model
    """
    # Only build the (potentially expensive) calibration dataset when a
    # data-aware algorithm was requested AND every required input is present.
    nncf_calibration_data = None
    if (
        builder_exported.calibration_seq_length is not None
        and builder_exported.calibration_data is not None
        and builder_exported.tokenizer_path is not None
        and (awq or scale_estimation)
    ):
        tokenizer = get_tokenizer(builder_exported.tokenizer_path)
        nncf_calibration_data = nncf.Dataset(
            get_calibration_data(
                builder_exported.pre_autograd_graph_module,  # type: ignore[arg-type]
                tokenizer,
                builder_exported.calibration_data,
                builder_exported.calibration_seq_length,
            ),
            transform_func=transform_fn,
        )

    # AWQ can work without a dataset as well.
    # Scale estimation cannot: if the dataset could not be built, report
    # exactly which calibration parameter(s) the caller left unset.
    if scale_estimation and not nncf_calibration_data:
        missing_params = []
        if builder_exported.calibration_data is None:
            missing_params.append("calibration_data")
        if builder_exported.calibration_seq_length is None:
            missing_params.append("calibration_seq_length")
        if builder_exported.tokenizer_path is None:
            missing_params.append("tokenizer_path")
        if missing_params:
            msg = (
                "Missing required calibration parameter(s): "
                + ", ".join(missing_params)
                + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path."
            )
            raise ValueError(msg)

    # Compress weights in place on the pre-autograd FX graph; AWQ and scale
    # estimation are delegated to NNCF's compress_pt2e implementation.
    builder_exported.pre_autograd_graph_module = (
        nncf.experimental.torch.fx.compress_pt2e(
            builder_exported.pre_autograd_graph_module,
            quantizer=quantizer,
            dataset=nncf_calibration_data,
            awq=awq,
            scale_estimation=scale_estimation,
        )
    )
    return builder_exported
106 changes: 90 additions & 16 deletions backends/openvino/quantizer/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import nncf # type: ignore[import-untyped]
import nncf.common.quantization as quantization # type: ignore[import-untyped]
import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped]

import torch.fx
from executorch.backends.openvino.quantizer.observers import (
INT4WeightObserver,
Expand Down Expand Up @@ -78,12 +77,12 @@ class OpenVINOQuantizer(Quantizer):
optimally for the inference via OpenVINO.
"""

# Maps weights-only quantization modes to the corresponding NNCF
# CompressWeightsMode string identifiers. There must be exactly one
# definition of this mapping.
WEIGHTS_ONLY_COMPRESSION_MODES = {
    QuantizationMode.INT4WO_SYM: "int4_sym",
    QuantizationMode.INT4WO_ASYM: "int4_asym",
    QuantizationMode.INT8WO_SYM: "int8_sym",
    QuantizationMode.INT8WO_ASYM: "int8_asym",
}

def __init__(
self,
Expand Down Expand Up @@ -116,17 +115,63 @@ def __init__(
preset=preset, model_type=model_type, **kwargs
)
else:
compression_mode = mode.value.replace(
"wo", ""
) # Mode value has to match NNCF CompressWeightsMode
compression_mode = OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES[
mode
] # Mode value has to match NNCF CompressWeightsMode
weight_compression_configuration = get_weight_compression_configuration(
nncf.CompressWeightsMode(compression_mode),
**kwargs,
)
subset_size = 1 # Doesn't really matter in this case since it is data-free. Should just be +ve
weight_compression_configuration["subset_size"] = (
1 # Doesn't really matter in this case since it is data-free. Should just be +ve
)

self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=subset_size, **weight_compression_configuration
**weight_compression_configuration
)

def _require_wc_algo(
    self,
) -> nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression:
    """Return the configured algorithm, asserting it is WeightCompression.

    :raises TypeError: If the quantizer was built in a PTQ mode instead.
    """
    expected_cls = (
        nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression
    )
    if isinstance(self._algo, expected_cls):
        return self._algo
    raise TypeError(
        "This method requires WeightCompression algo, but "
        f"got {type(self._algo).__name__} (mode={self.mode})."
    )

def _require_ptq_algo(self) -> MinMaxQuantization:
    """Return the configured algorithm, asserting it is MinMaxQuantization.

    :raises TypeError: If the quantizer was built in a weights-only mode.
    """
    if isinstance(self._algo, MinMaxQuantization):
        return self._algo
    raise TypeError(
        "This method requires MinMaxQuantization algo, but "
        f"got {type(self._algo).__name__} (mode={self.mode})."
    )

def get_weights_compression_config(self) -> Dict[str, Any]:
    """
    Returns a dictionary with all_layers, group_size, backup_mode and Quantization mode parameters
    used by the compress_pt2e weight compression algorithm.

    :return: A dictionary containing:
        1. mode: Quantization mode. One of INT4 Sym, INT4 Asym, INT8 Sym, INT8 Asym.
        2. group_size: group size to be used for group-wise compression.
        3. all_layers: Indicates whether embeddings and last MatMul layers should be compressed to a primary
        precision. By default, the backup precision is assigned for the embeddings and last MatMul layers.
        4. backup_mode: Defines a backup mode for mixed-precision weight compression.
    """
    wc_algo = self._require_wc_algo()
    # Expose only the attributes that the quantizer itself initialized.
    return {
        "mode": wc_algo.mode,
        "group_size": wc_algo.group_size,
        "all_layers": wc_algo.all_layers,
        "backup_mode": wc_algo.backup_mode,
    }

def set_ignored_scope(
self,
Expand Down Expand Up @@ -160,8 +205,32 @@ def set_ignored_scope(
def get_nncf_quantization_setup(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
self._algo._set_backend_entity(model)
return self._algo.find_quantization_setup(model, nncf_graph)
algo = self._require_ptq_algo()
algo._set_backend_entity(model)
return algo.find_quantization_setup(model, nncf_graph)

def get_nncf_weight_compression_parameters(
    self,
    model: torch.fx.GraphModule,
    nncf_graph: NNCFGraph,
) -> Tuple[
    List[WeightCompressionParameters],
    List[WeightCompressionParameters],
    List[WeightCompressionParameters],
]:
    """
    Collect weight compression parameters for the given FX model and NNCF graph.

    :param model: FX GraphModule to analyze for weight compression.
    :param nncf_graph: NNCFGraph representation of the model.
    :return: A tuple of:
        - all parameters eligible for weight compression,
        - ratio-defining parameters used to set primary/backup precisions,
        - parameters that are not compressible and remain in original precision.
    """
    wc_algo = self._require_wc_algo()
    # The backend entity must be bound to this model before parameters
    # can be collected.
    wc_algo.set_backend_entity(model)
    return wc_algo.get_weight_compression_parameters(model, nncf_graph)

def _annotate_weight_compression(
self,
Expand All @@ -182,12 +251,17 @@ def _annotate_weight_compression(
:param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
:return: Updated mapping of FX nodes with weight compression annotations.
"""
self._algo.set_backend_entity(model)
all_wc_params, _ = self._algo.get_weight_compression_parameters(
all_wc_params, *_ = self.get_nncf_weight_compression_parameters(
model, nncf_graph
)

for wc_param in all_wc_params:
if not wc_param.compression_config:
nncf_logger.debug(
"Skipping weight compression for node '%s' because compression_config is missing.",
getattr(wc_param.node_with_weight, "node_name", "<unknown>"),
)
continue
node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node_with_weight.node_name
Expand Down
2 changes: 1 addition & 1 deletion backends/openvino/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf
nncf==3.0.0
3 changes: 3 additions & 0 deletions backends/openvino/tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ backends/openvino/tests
└── test_<op_name>.py # Individual op tests scripts.
├── models # Directory with model test scripts.
└── test_classification.py # Test script for classification models.
├── quantizer # Directory with quantizer test scripts.
└── test_llm_compression.py # Test script for llm compression using NNCF algorithms.
├── README.md # Documentation for unit tests (this file)
└── test_runner.py # Script to execute unit tests.
```
Expand All @@ -31,6 +33,7 @@ Before you begin, refer to instructions provided in [OpenVINO Backend for ExecuT
Supported values:
- `ops` (default)
- `models`
- `quantizer`

- **`--pattern`** (optional):
Pattern to match test files. Provide complete file name to run individual tests. The default value is `test_*.py`
Expand Down
22 changes: 22 additions & 0 deletions backends/openvino/tests/quantizer/synthetic_test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import torch


class ExportLlamaTestModel(torch.nn.Module):
    """Minimal Llama-shaped model (embedding -> linear stack -> LM head)
    used as a synthetic fixture in quantizer/compression unit tests."""

    def __init__(self, vocab_size=5, hidden_size=2, num_layers=1):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.layers = torch.nn.ModuleList(
            torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)
        )
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size

    def forward(self, tokens, input_pos):
        # input_pos is accepted to mirror the Llama export signature but is
        # not used by this synthetic model.
        hidden = self.embed(tokens)
        for block in self.layers:
            hidden = torch.relu(block(hidden))
        return self.lm_head(hidden)
Loading
Loading