diff --git a/backends/openvino/quantizer/observers/nncf_observers.py b/backends/openvino/quantizer/observers/nncf_observers.py
new file mode 100644
index 00000000000..54f4348e0ed
--- /dev/null
+++ b/backends/openvino/quantizer/observers/nncf_observers.py
@@ -0,0 +1,114 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch.ao.quantization.observer import (
+    MappingType,
+    PerAxis,
+    PerChannelMinMaxObserver,
+    PerGroup,
+    get_block_size,
+)
+from torch.ao.quantization.pt2e._affine_quantization import (
+    _get_reduction_params,
+    AffineQuantizedMinMaxObserver,
+)
+
+from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
+from nncf.experimental.torch.fx.transformations import (
+    constant_update_fn,
+    module_insertion_transformation_builder,
+)
+from nncf.parameters import CompressWeightsMode
+from nncf.quantization.algorithms.weight_compression.config import (
+    WeightCompressionConfig,
+)
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (
+    do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor
+from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
+from nncf.torch.quantization.layers import (
+    INT4AsymmetricWeightsDecompressor,
+    INT4SymmetricWeightsDecompressor,
+    INT8AsymmetricWeightsDecompressor,
+    INT8SymmetricWeightsDecompressor,
+)
+
+
+class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
+    """Per-group (blockwise) weight observer that delegates INT4 quantization to NNCF."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        qmode = (
+            CompressWeightsMode.INT4_ASYM
+            if self.mapping_type == MappingType.ASYMMETRIC
+            else CompressWeightsMode.INT4_SYM
+        )
+        assert isinstance(
+            self.granularity, PerGroup
+        ), "Only PerGroup granularity is supported"
+        self.wc_config = WeightCompressionConfig(
+            mode=qmode, group_size=self.granularity.group_size
+        )
+
+    def calculate_qparams(self, weight):
+        assert hasattr(self, "min_val") and hasattr(
+            self, "max_val"
+        ), "Expecting the observer to have min_val and max_val; run the observer before calling calculate_qparams"
+        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
+        assert len(reduction_dims) == 1, "Only 1-D group size is supported"
+        reduction_dims = reduction_dims[0] - 1
+        q_weight, scale, zp = do_integer_quantization(
+            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+        )
+        zp = zp.data if zp is not None else None
+        return q_weight.data, scale.data, zp
+
+    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+        assert (
+            self.original_dtype is not None
+        ), "Expecting original_dtype to be populated"
+        weight_node = observer_node.args[0]
+        original_weight = get_tensor_constant_from_node(weight_node, model)
+        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+        with model.graph.inserting_before(observer_node):
+            if zero_point is not None:
+                decompressor = INT4AsymmetricWeightsDecompressor(
+                    scale,
+                    zero_point,
+                    q_weight.shape,
+                    original_weight.shape,
+                    original_weight.dtype,
+                )
+            else:
+                decompressor = INT4SymmetricWeightsDecompressor(
+                    scale, q_weight.shape, original_weight.shape, original_weight.dtype
+                )
+            # Swap the fp weight constant for the packed INT4 payload and attach
+            # the matching NNCF decompressor right after the new constant.
+            packed_q_weight = decompressor.pack_weight(q_weight)
+            new_weight_node = constant_update_fn(
+                model, observer_node, packed_q_weight, input_port_id=0
+            )
+            decompressor_name = f"NNCFDecompressor_{new_weight_node.name}"
+
+            module_insertion_transformation_builder(
+                decompressor,
+                [
+                    PTTargetPoint(
+                        TargetType.OPERATOR_POST_HOOK,
+                        target_node_name=new_weight_node.name,
+                    )
+                ],
+                decompressor_name,
+            )(model)
+        # The observer's input is now the decompressor output; reroute the
+        # observer's users to it and drop the observer node.
+        decomp_node = observer_node.args[0]
+        observer_node.replace_all_uses_with(decomp_node)
+        model.graph.erase_node(observer_node)
+
+
+class NNCFInt8observer(PerChannelMinMaxObserver):
+    """Per-channel weight observer that delegates INT8 compression to NNCF."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        qmode = (
+            CompressWeightsMode.INT8_SYM
+            if self.qscheme == torch.per_channel_symmetric
+            else CompressWeightsMode.INT8_ASYM
+        )
+        self.wc_config = WeightCompressionConfig(mode=qmode)
+
+    def calculate_qparams(self, weight):
+        assert hasattr(self, "min_val") and hasattr(
+            self, "max_val"
+        ), "Expecting the observer to have min_val and max_val; run the observer before calling calculate_qparams"
+        self.granularity = PerAxis(axis=self.ch_axis)
+        self.block_size = get_block_size(weight.shape, self.granularity)
+        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
+        q_weight, scale, zp = do_integer_quantization(
+            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
+        )
+        zp = zp.data if zp is not None else None
+        return q_weight.data, scale.data, zp
+
+    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
+        weight_node = observer_node.args[0]
+        original_weight = get_tensor_constant_from_node(weight_node, model)
+        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+        with model.graph.inserting_before(observer_node):
+            if zero_point is not None:
+                decompressor = INT8AsymmetricWeightsDecompressor(
+                    scale, zero_point, original_weight.dtype
+                )
+            else:
+                decompressor = INT8SymmetricWeightsDecompressor(
+                    scale, original_weight.dtype
+                )
+            packed_q_weight = decompressor.pack_weight(q_weight)
+            new_weight_node = constant_update_fn(
+                model, observer_node, packed_q_weight, input_port_id=0
+            )
+            decompressor_name = f"NNCFDecompressor_{new_weight_node.name}"
+
+            module_insertion_transformation_builder(
+                decompressor,
+                [
+                    PTTargetPoint(
+                        TargetType.OPERATOR_POST_HOOK,
+                        target_node_name=new_weight_node.name,
+                    )
+                ],
+                decompressor_name,
+            )(model)
+        decomp_node = observer_node.args[0]
+        observer_node.replace_all_uses_with(decomp_node)
+        model.graph.erase_node(observer_node)
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index d0622b24e6d..f8f08996f53 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -21,6 +21,8 @@
     HistogramObserver,
     PerChannelMinMaxObserver,
     UniformQuantizationObserverBase,
+    PerGroup,
+    MappingType,
 )
 from torchao.quantization.pt2e.quantizer import (
     EdgeOrNode,
@@ -30,6 +32,9 @@
     Quantizer,
     SharedQuantizationSpec,
 )
+from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
+from executorch.backends.openvino.quantizer.observers.nncf_observers import (
+    PTPerBlockParamObserver,
+    NNCFInt8observer,
+)
 
 QUANT_ANNOTATION_KEY = "quantization_annotation"
 
@@ -46,6 +51,10 @@ class QuantizationMode(Enum):
     INT8_SYM = "int8_sym"
     INT8_MIXED = "int8_mixed"
     INT8_TRANSFORMER = "int8_transformer"
+    # Weight-only compression modes. Stripping the "_wc" suffix from the value
+    # yields the matching NNCF CompressWeightsMode value (see __init__).
+    INT8_SYM_WC = "int8_sym_wc"
+    INT8_ASYM_WC = "int8_asym_wc"
+    INT4_SYM_WC = "int4_sym"
+    INT4_ASYM_WC = "int4_asym"
 
 
 class OpenVINOQuantizer(Quantizer):
@@ -66,8 +75,12 @@ def __init__(
             - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
             - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models.
             Default value is INT8_SYM.
+            - INT8_SYM_WC: Symmetric INT8 weights-only compression.
+            - INT8_ASYM_WC: Asymmetric INT8 weights-only compression.
+            - INT4_SYM_WC: Symmetric INT4 weights-only compression.
+            - INT4_ASYM_WC: Asymmetric INT4 weights-only compression.
+            For the *_WC modes, kwargs are forwarded to the NNCF weight compression configuration.
         :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
         """
+        self.mode = mode
+        self.wc_modes = [
+            QuantizationMode.INT4_ASYM_WC,
+            QuantizationMode.INT4_SYM_WC,
+            QuantizationMode.INT8_ASYM_WC,
+            QuantizationMode.INT8_SYM_WC,
+        ]
         if mode == QuantizationMode.INT8_SYM:
             preset = quantization.structs.QuantizationPreset.PERFORMANCE
             model_type = None
@@ -77,11 +90,24 @@ def __init__(
         else:
             preset = None
             model_type = nncf.parameters.ModelType.TRANSFORMER
-        self._min_max_algo = (
-            nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
-                preset=preset, model_type=model_type, **kwargs
+        if self.mode not in self.wc_modes:
+            self._min_max_algo = (
+                nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
+                    preset=preset, model_type=model_type, **kwargs
+                )
             )
-        )
+            self._algo = self._min_max_algo
+        else:
+            weight_compression_configuration = get_weight_compression_configuration(
+                # The mode value has to match NNCF CompressWeightsMode.
+                mode.value.replace("_wc", ""),
+                **kwargs,
+            )
+            self._weight_compression_algo = (
+                nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+                    subset_size=None, **weight_compression_configuration
+                )
+            )
+            self._algo = self._weight_compression_algo
 
     def set_ignored_scope(
         self,
@@ -102,7 +128,7 @@ def set_ignored_scope(
         :param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
             in the model graph.
         """
-        self._min_max_algo.set_ignored_scope(
+        self._algo.set_ignored_scope(
             nncf.IgnoredScope(
                 names=names or [],
                 patterns=patterns or [],
@@ -115,63 +141,80 @@ def set_ignored_scope(
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
-        self._min_max_algo._set_backend_entity(model)
-        return self._min_max_algo.find_quantization_setup(model, nncf_graph)
+        self._algo._set_backend_entity(model)
+        return self._algo.find_quantization_setup(model, nncf_graph)
 
     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
         nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
-        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
-
         graph = model.graph
         node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
            defaultdict(QuantizationAnnotation)
         )
+        # Separate annotation paths: weight-only compression vs. quantization.
+        if self.mode in self.wc_modes:
+            self._algo.set_backend_entity(model)
+            nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+            for node in nodes_to_compress:
+                quantization_insertion_point = (
+                    quantization.quantizer_setup.WeightQuantizationInsertionPoint(
+                        target_node_name=node.node_name
+                    )
+                )
+                group_size = self._algo._group_size
+                num_bits = (
+                    4
+                    if self.mode
+                    in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC]
+                    else 8
+                )
+                qmode = (
+                    QuantizationScheme.SYMMETRIC
+                    if self.mode
+                    in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC]
+                    else QuantizationScheme.ASYMMETRIC
+                )
+                nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
+                qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(
+                    qip=quantization_insertion_point,
+                    qconfig=nncf_qconfig,
+                    directly_quantized_operator_node_names=[node.node_name],
+                )
+                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                    graph, nncf_graph, qp, node_vs_torch_annotation
+                )
+                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(
+                    qp, group_size=group_size, weights_only=True
+                )
+                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+        else:
+            quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
 
-        for qp in quantization_setup.quantization_points.values():
-            edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                graph, nncf_graph, qp, node_vs_torch_annotation
-            )
-            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
-            self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+            for qp in quantization_setup.quantization_points.values():
+                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                    graph, nncf_graph, qp, node_vs_torch_annotation
+                )
+                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
-        for quantizer_ids in quantization_setup.unified_scale_groups.values():
+            for quantizer_ids in quantization_setup.unified_scale_groups.values():
 
-            root_quantizer_id = self._get_unified_scales_root_quantizer_id(
-                nncf_graph, quantizer_ids, quantization_setup
-            )
-            root_qp = quantization_setup.quantization_points[root_quantizer_id]
+                root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+                    nncf_graph, quantizer_ids, quantization_setup
+                )
+                root_qp = quantization_setup.quantization_points[root_quantizer_id]
 
-            if any(
-                root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
-                for q_id in quantizer_ids
-            ):
-                qps = [
-                    quantization_setup.quantization_points[q_id]
-                    for q_id in quantizer_ids
-                ]
-                msg = (
-                    "Different quantization configs are set to one unified scale group:"
-                    f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
-                )
-                raise nncf.InternalError(msg)
+                if any(
+                    root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+                    for q_id in quantizer_ids
+                ):
+                    qps = [
+                        quantization_setup.quantization_points[q_id]
+                        for q_id in quantizer_ids
+                    ]
+                    msg = (
+                        "Different quantization configs are set to one unified scale group:"
+                        f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+                    )
+                    raise nncf.InternalError(msg)
 
-            root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
-                graph, root_qp.insertion_point.target_node_name
-            )
-            root_edge_or_node = self._get_edge_or_node(
-                root_target_node, root_qp, nncf_graph
-            )
+                root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+                    graph, root_qp.insertion_point.target_node_name
+                )
+                root_edge_or_node = self._get_edge_or_node(
+                    root_target_node, root_qp, nncf_graph
+                )
 
-            for quantizer_id in quantizer_ids:
-                if quantizer_id == root_quantizer_id:
-                    continue
+                for quantizer_id in quantizer_ids:
+                    if quantizer_id == root_quantizer_id:
+                        continue
 
-                qspec = SharedQuantizationSpec(root_edge_or_node)
-                qp = quantization_setup.quantization_points[quantizer_id]
-                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                    graph, nncf_graph, qp, node_vs_torch_annotation
-                )
-                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+                    qspec = SharedQuantizationSpec(root_edge_or_node)
+                    qp = quantization_setup.quantization_points[quantizer_id]
+                    edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                        graph, nncf_graph, qp, node_vs_torch_annotation
+                    )
+                    self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
         for node, annotation in node_vs_torch_annotation.items():
             assert QUANT_ANNOTATION_KEY not in node.meta
@@ -295,8 +338,8 @@ def _fill_torch_ao_annotation(
             annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
 
     @staticmethod
-    def _get_torch_ao_qspec_from_qp(
-        qp: quantization.quantizer_setup.QuantizationPointBase,
+    def _get_torch_ao_qspec_from_nncf_config(
+        qp: quantization.quantizer_setup.QuantizationPointBase,
+        group_size: int = -1,
+        weights_only: bool = False,
     ) -> QuantizationSpec:
         """
         Retrieves the quantization configuration from the given quantization point and
@@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp(
         """
         # Eps value is copied from nncf/torch/quantization/layers.py
        extra_args = {"eps": 1e-16}
-        qconfig = qp.qconfig
         is_weight = qp.is_weight_quantization_point()
+        qconfig = qp.qconfig
         observer: Type[UniformQuantizationObserverBase]
-
         if qconfig.per_channel:
             torch_qscheme = (
                 torch.per_channel_symmetric
@@ -325,11 +367,27 @@ def _get_torch_ao_qspec_from_qp(
                 else torch.per_tensor_affine
             )
         if is_weight:
-            observer = PerChannelMinMaxObserver
-            quant_min = -128
-            quant_max = 127
-            dtype = torch.int8
-            channel_axis = 0
+            mapping_type = (
+                MappingType.SYMMETRIC
+                if qconfig.mode == QuantizationScheme.SYMMETRIC
+                else MappingType.ASYMMETRIC
+            )
+            if qconfig.num_bits == 4:
+                extra_args["mapping_type"] = mapping_type
+                extra_args["target_dtype"] = torch.int8
+                extra_args["granularity"] = PerGroup(group_size=group_size)
+                observer = PTPerBlockParamObserver
+                quant_min = -8
+                quant_max = 7
+                dtype = torch.int8
+                channel_axis = 0
+            elif qconfig.num_bits == 8:
+                observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+                quant_min = -128
+                quant_max = 127
+                dtype = torch.int8
+                channel_axis = 0
+                # 8-bit weights always use a per-channel scheme.
+                torch_qscheme = (
+                    torch.per_channel_symmetric
+                    if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+                    else torch.per_channel_affine
+                )
         else:
             observer = (
                 HistogramObserver
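
Reviewer note, not part of the patch: below is a minimal, hypothetical sketch of how the new weight-compression path would be driven end to end. It assumes the torchao PT2E entry points (`prepare_pt2e`/`convert_pt2e`) and `torch.export.export`; the toy model and every name not present in this diff are illustrative only.

```python
import torch
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.openvino.quantizer.quantizer import (
    OpenVINOQuantizer,
    QuantizationMode,
)

# Toy model: in a *_WC mode only the Linear weight is annotated;
# activations stay in floating point.
model = torch.nn.Sequential(torch.nn.Linear(128, 64), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 128),)
exported = torch.export.export(model, example_inputs).module()

quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4_SYM_WC)
prepared = prepare_pt2e(exported, quantizer)  # inserts PTPerBlockParamObserver on weights
prepared(*example_inputs)                     # populates min_val/max_val on the observers
converted = convert_pt2e(prepared)            # observer.convert() packs the INT4 weight
                                              # and inserts an NNCF decompressor module
```

If this matches the intended flow, a test along these lines next to the new observers would help lock in the per-mode observer selection (`PTPerBlockParamObserver` for INT4, `NNCFInt8observer` for INT8 weights-only).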