Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions backends/openvino/quantizer/observers/nncf_observers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Tuple

import torch
from torch.ao.quantization.observer import MappingType, PerGroup, PerAxis, PerChannelMinMaxObserver, get_block_size
from torch.ao.quantization.pt2e._affine_quantization import (
_get_reduction_params,
AffineQuantizedMinMaxObserver,
)
from nncf.torch.quantization.layers import INT4AsymmetricWeightsDecompressor, INT4SymmetricWeightsDecompressor, INT8AsymmetricWeightsDecompressor, INT8SymmetricWeightsDecompressor
from nncf.experimental.torch.fx.transformations import constant_update_fn, module_insertion_transformation_builder
from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType

from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.parameters import CompressWeightsMode
from nncf.tensor.tensor import Tensor

class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
    """Weight observer for INT4 per-group (block-wise) compression via NNCF.

    Collects min/max statistics like :class:`AffineQuantizedMinMaxObserver`,
    but delegates the actual quantization to NNCF's weight-compression
    backend. On ``convert`` the observed weight constant is replaced by a
    packed INT4 tensor and an NNCF decompressor module is inserted after it,
    so the rest of the graph keeps consuming a dequantized weight.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Map the torchao mapping type onto the matching NNCF INT4 mode.
        qmode = (
            CompressWeightsMode.INT4_ASYM
            if self.mapping_type == MappingType.ASYMMETRIC
            else CompressWeightsMode.INT4_SYM
        )
        assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
        self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)

    def calculate_qparams(self, weight):
        """Quantize ``weight`` with NNCF and return ``(q_weight, scale, zero_point)``.

        ``zero_point`` is ``None`` in the symmetric mode. The observer must
        have been run before this call so that ``min_val``/``max_val`` exist.
        """
        assert hasattr(self, "min_val") and hasattr(
            self, "max_val"
        ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
        assert len(reduction_dims) == 1, "Only 1-D group size is supported"
        # NNCF expects the axis preceding the group dimension introduced by
        # the per-group block layout — TODO confirm against NNCF docs.
        reduction_dims = reduction_dims[0] - 1
        q_weight, scale, zp = do_integer_quantization(
            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
        )
        zp = zp.data if zp is not None else None
        return q_weight.data, scale.data, zp

    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
        """Fold this observer into the graph.

        Replaces the observed weight constant with its packed INT4 form,
        inserts the matching NNCF decompressor module right after it, then
        reroutes all users of ``observer_node`` and erases the node.
        """
        assert (
            self.original_dtype is not None
        ), "Expecting original_dtype to be populated"
        weight_node = observer_node.args[0]
        original_weight = get_tensor_constant_from_node(weight_node, model)
        q_weight, scale, zero_point = self.calculate_qparams(original_weight)

        with model.graph.inserting_before(observer_node):
            # A present zero point means the asymmetric scheme was used.
            if zero_point is not None:
                decompressor = INT4AsymmetricWeightsDecompressor(
                    scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype
                )
            else:
                decompressor = INT4SymmetricWeightsDecompressor(
                    scale, q_weight.shape, original_weight.shape, original_weight.dtype
                )
            packed_q_weight = decompressor.pack_weight(q_weight)
            new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
            decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'

            module_insertion_transformation_builder(
                decompressor,
                [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
                decompressor_name,
            )(model)
            # constant_update_fn rewired observer_node's input to the new
            # weight chain; its users must now consume that chain directly.
            decomp_node = observer_node.args[0]
            observer_node.replace_all_uses_with(decomp_node)
            model.graph.erase_node(observer_node)


class NNCFInt8observer(PerChannelMinMaxObserver):
    """Per-channel weight observer for INT8 weight-only compression via NNCF.

    Statistics collection is inherited from
    :class:`PerChannelMinMaxObserver`; quantization itself is delegated to
    NNCF's weight-compression backend. On ``convert`` the weight constant is
    replaced by its packed INT8 form and an NNCF decompressor module is
    inserted after it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Map the torch qscheme onto the matching NNCF INT8 mode.
        qmode = (
            CompressWeightsMode.INT8_SYM
            if self.qscheme == torch.per_channel_symmetric
            else CompressWeightsMode.INT8_ASYM
        )
        self.wc_config = WeightCompressionConfig(mode=qmode)

    def calculate_qparams(self, weight):
        """Quantize ``weight`` with NNCF and return ``(q_weight, scale, zero_point)``.

        ``zero_point`` is ``None`` in the symmetric mode. Side effect: sets
        ``self.granularity`` and ``self.block_size`` from the weight's shape
        and the observer's channel axis.
        """
        assert hasattr(self, "min_val") and hasattr(
            self, "max_val"
        ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
        self.granularity = PerAxis(axis=self.ch_axis)
        self.block_size = get_block_size(weight.shape, self.granularity)
        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
        q_weight, scale, zp = do_integer_quantization(
            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
        )
        zp = zp.data if zp is not None else None
        return q_weight.data, scale.data, zp

    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
        """Fold this observer into the graph.

        Replaces the observed weight constant with its packed INT8 form,
        inserts the matching NNCF decompressor module right after it, then
        reroutes all users of ``observer_node`` and erases the node.
        """
        weight_node = observer_node.args[0]
        original_weight = get_tensor_constant_from_node(weight_node, model)
        q_weight, scale, zero_point = self.calculate_qparams(original_weight)

        with model.graph.inserting_before(observer_node):
            # A present zero point means the asymmetric scheme was used.
            if zero_point is not None:
                decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
            else:
                decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
            packed_q_weight = decompressor.pack_weight(q_weight)
            new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
            decompressor_name = f'NNCFDecompressor_{new_weight_node.name}'

            module_insertion_transformation_builder(
                decompressor,
                [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
                decompressor_name,
            )(model)
            # constant_update_fn rewired observer_node's input to the new
            # weight chain; its users must now consume that chain directly.
            decomp_node = observer_node.args[0]
            observer_node.replace_all_uses_with(decomp_node)
            model.graph.erase_node(observer_node)
170 changes: 114 additions & 56 deletions backends/openvino/quantizer/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
HistogramObserver,
PerChannelMinMaxObserver,
UniformQuantizationObserverBase,
PerGroup,
MappingType,
)
from torchao.quantization.pt2e.quantizer import (
EdgeOrNode,
Expand All @@ -30,6 +32,9 @@
Quantizer,
SharedQuantizationSpec,
)
from nncf.quantization.quantize_model import get_weight_compression_configuration
from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver,NNCFInt8observer

QUANT_ANNOTATION_KEY = "quantization_annotation"

Expand All @@ -46,6 +51,10 @@ class QuantizationMode(Enum):
INT8_SYM = "int8_sym"
INT8_MIXED = "int8_mixed"
INT8_TRANSFORMER = "int8_transformer"
INT8_SYM_WC = "int8_sym_wc"
INT8_ASYM_WC = "int8_asym_wc"
INT4_SYM_WC = "int4_sym"
INT4_ASYM_WC = "int4_asym"


class OpenVINOQuantizer(Quantizer):
Expand All @@ -66,8 +75,12 @@ def __init__(
- INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
- INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
Default value is INT8_SYM.
- INT4_SYM_WC: Symmetric INT4 Weights-Only Compression
- INT4_ASYM_WC: Asymmetric INT4 Weights-Only Compression
:param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
"""
self.mode = mode
self.wc_modes = [QuantizationMode.INT4_ASYM_WC,QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
if mode == QuantizationMode.INT8_SYM:
preset = quantization.structs.QuantizationPreset.PERFORMANCE
model_type = None
Expand All @@ -77,11 +90,24 @@ def __init__(
else:
preset = None
model_type = nncf.parameters.ModelType.TRANSFORMER
self._min_max_algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
if(self.mode not in self.wc_modes):
self._min_max_algo = (
nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
preset=preset, model_type=model_type, **kwargs
)
)
)
self._algo = self._min_max_algo
else:
weight_compression_configuration = get_weight_compression_configuration(
mode.value.replace("_wc", ""), # Mode value has to match NNCF CompressWeightsMode
**kwargs
)
self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
subset_size=None,
**weight_compression_configuration
)
self._algo = self._weight_compression_algo


def set_ignored_scope(
self,
Expand All @@ -102,7 +128,7 @@ def set_ignored_scope(
:param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
in the model graph.
"""
self._min_max_algo.set_ignored_scope(
self._algo.set_ignored_scope(
nncf.IgnoredScope(
names=names or [],
patterns=patterns or [],
Expand All @@ -115,63 +141,80 @@ def set_ignored_scope(
def get_nncf_quantization_setup(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
self._min_max_algo._set_backend_entity(model)
return self._min_max_algo.find_quantization_setup(model, nncf_graph)
self._algo._set_backend_entity(model)
return self._algo.find_quantization_setup(model, nncf_graph)

def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)


graph = model.graph
node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
defaultdict(QuantizationAnnotation)
)
# Separate into annotations for quantization and for weight compression
if(self.mode in self.wc_modes):
self._algo.set_backend_entity(model)
nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
for node in nodes_to_compress:
quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
group_size = self._algo._group_size
num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT4_ASYM_WC] else 8
qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC,QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
else:
quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)

for qp in quantization_setup.quantization_points.values():
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
for qp in quantization_setup.quantization_points.values():
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)

for quantizer_ids in quantization_setup.unified_scale_groups.values():
for quantizer_ids in quantization_setup.unified_scale_groups.values():

root_quantizer_id = self._get_unified_scales_root_quantizer_id(
nncf_graph, quantizer_ids, quantization_setup
)
root_qp = quantization_setup.quantization_points[root_quantizer_id]
root_quantizer_id = self._get_unified_scales_root_quantizer_id(
nncf_graph, quantizer_ids, quantization_setup
)
root_qp = quantization_setup.quantization_points[root_quantizer_id]

if any(
root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
):
qps = [
quantization_setup.quantization_points[q_id]
if any(
root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
for q_id in quantizer_ids
]
msg = (
"Different quantization configs are set to one unified scale group:"
f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
):
qps = [
quantization_setup.quantization_points[q_id]
for q_id in quantizer_ids
]
msg = (
"Different quantization configs are set to one unified scale group:"
f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
)
raise nncf.InternalError(msg)

root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, root_qp.insertion_point.target_node_name
)
root_edge_or_node = self._get_edge_or_node(
root_target_node, root_qp, nncf_graph
)
raise nncf.InternalError(msg)

root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, root_qp.insertion_point.target_node_name
)
root_edge_or_node = self._get_edge_or_node(
root_target_node, root_qp, nncf_graph
)

for quantizer_id in quantizer_ids:
if quantizer_id == root_quantizer_id:
continue
for quantizer_id in quantizer_ids:
if quantizer_id == root_quantizer_id:
continue

qspec = SharedQuantizationSpec(root_edge_or_node)
qp = quantization_setup.quantization_points[quantizer_id]
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
qspec = SharedQuantizationSpec(root_edge_or_node)
qp = quantization_setup.quantization_points[quantizer_id]
edge_or_node, annotation = self._get_edge_or_node_and_annotation(
graph, nncf_graph, qp, node_vs_torch_annotation
)
self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)

for node, annotation in node_vs_torch_annotation.items():
assert QUANT_ANNOTATION_KEY not in node.meta
Expand Down Expand Up @@ -295,8 +338,8 @@ def _fill_torch_ao_annotation(
annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec

@staticmethod
def _get_torch_ao_qspec_from_qp(
qp: quantization.quantizer_setup.QuantizationPointBase,
def _get_torch_ao_qspec_from_nncf_config(
qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
) -> QuantizationSpec:
"""
Retrieves the quantization configuration from the given quantization point and
Expand All @@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp(
"""
# Eps value is copied from nncf/torch/quantization/layers.py
extra_args = {"eps": 1e-16}
qconfig = qp.qconfig
is_weight = qp.is_weight_quantization_point()
qconfig = qp.qconfig

observer: Type[UniformQuantizationObserverBase]

if qconfig.per_channel:
torch_qscheme = (
torch.per_channel_symmetric
Expand All @@ -325,11 +367,27 @@ def _get_torch_ao_qspec_from_qp(
else torch.per_tensor_affine
)
if is_weight:
observer = PerChannelMinMaxObserver
quant_min = -128
quant_max = 127
dtype = torch.int8
channel_axis = 0
mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
if qconfig.num_bits==4:
extra_args["mapping_type"] = mapping_type
extra_args["target_dtype"] = torch.int8
extra_args["granularity"] = PerGroup(group_size=group_size)
observer = PTPerBlockParamObserver
quant_min = -8
quant_max = 7
dtype = torch.int8
channel_axis = 0
elif qconfig.num_bits==8:
observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
quant_min = -128
quant_max = 127
dtype = torch.int8
channel_axis = 0
torch_qscheme = (
torch.per_channel_symmetric
if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
else torch.per_channel_affine
)
else:
observer = (
HistogramObserver
Expand Down
Loading