From 7698b934ee27395014d74727f4d0d0b5a2f297fb Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 13 Mar 2026 10:06:55 +0800 Subject: [PATCH 01/90] init Signed-off-by: n1ck-guo --- auto_round/algorithms/__init__.py | 13 + auto_round/algorithms/alg_config.py | 18 + auto_round/algorithms/base.py | 17 + .../algorithms/quantization/__init__.py | 13 + .../quantization/auto_round/__init__.py | 13 + .../quantization/auto_round/config.py | 67 + .../quantization/auto_round/quantize.py | 396 ++++ auto_round/algorithms/quantization/base.py | 27 + .../algorithms/quantization/rtn/__init__.py | 13 + .../algorithms/quantization/rtn/config.py | 9 + auto_round/algorithms/quantization/rtn/rtn.py | 13 + auto_round/calibration/__init__.py | 13 + auto_round/calibration/utils.py | 53 + auto_round/compressors/utils.py | 74 +- auto_round/compressors_new/__init__.py | 13 + auto_round/compressors_new/base.py | 1846 +++++++++++++++++ auto_round/compressors_new/config.py | 296 +++ auto_round/compressors_new/shard_writer.py | 242 +++ auto_round/compressors_new/utils.py | 1034 +++++++++ auto_round/context/__init__.py | 13 + auto_round/context/base.py | 26 + auto_round/context/compress_context.py | 51 + auto_round/context/model_context.py | 227 ++ auto_round/formats.py | 4 +- auto_round/schemes.py | 150 +- auto_round/utils/model.py | 6 +- 26 files changed, 4640 insertions(+), 7 deletions(-) create mode 100644 auto_round/algorithms/__init__.py create mode 100644 auto_round/algorithms/alg_config.py create mode 100644 auto_round/algorithms/base.py create mode 100644 auto_round/algorithms/quantization/__init__.py create mode 100644 auto_round/algorithms/quantization/auto_round/__init__.py create mode 100644 auto_round/algorithms/quantization/auto_round/config.py create mode 100644 auto_round/algorithms/quantization/auto_round/quantize.py create mode 100644 auto_round/algorithms/quantization/base.py create mode 100644 auto_round/algorithms/quantization/rtn/__init__.py create mode 100644 auto_round/algorithms/quantization/rtn/config.py create mode 100644 auto_round/algorithms/quantization/rtn/rtn.py create mode 100644 auto_round/calibration/__init__.py create mode 100644 auto_round/calibration/utils.py create mode 100644 auto_round/compressors_new/__init__.py create mode 100644 auto_round/compressors_new/base.py create mode 100644 auto_round/compressors_new/config.py create mode 100644 auto_round/compressors_new/shard_writer.py create mode 100644 auto_round/compressors_new/utils.py create mode 100644 auto_round/context/__init__.py create mode 100644 auto_round/context/base.py create mode 100644 auto_round/context/compress_context.py create mode 100644 auto_round/context/model_context.py diff --git a/auto_round/algorithms/__init__.py b/auto_round/algorithms/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/auto_round/algorithms/alg_config.py b/auto_round/algorithms/alg_config.py new file mode 100644 index 000000000..d9d5f0c75 --- /dev/null +++ b/auto_round/algorithms/alg_config.py @@ -0,0 +1,18 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class AlgConfig: + def __init__(self): + pass diff --git a/auto_round/algorithms/base.py b/auto_round/algorithms/base.py new file mode 100644 index 000000000..4590536c5 --- /dev/null +++ b/auto_round/algorithms/base.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseAlgorithm: + pass diff --git a/auto_round/algorithms/quantization/__init__.py b/auto_round/algorithms/quantization/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/quantization/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/algorithms/quantization/auto_round/__init__.py b/auto_round/algorithms/quantization/auto_round/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/quantization/auto_round/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/auto_round/algorithms/quantization/auto_round/config.py b/auto_round/algorithms/quantization/auto_round/config.py
new file mode 100644
index 000000000..8272ea156
--- /dev/null
+++ b/auto_round/algorithms/quantization/auto_round/config.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from auto_round.algorithms.alg_config import AlgConfig
+from auto_round.logger import logger
+
+
+class AutoRoundConfig(AlgConfig):
+    """Configuration for the AutoRound tuning algorithm.
+
+    Args:
+        iters (int): Number of iterations (default is 200).
+        lr (float): The learning rate (default is None, set to 1.0/iters when not given).
+        minmax_lr (float): The learning rate for min-max tuning (default is None).
+        lr_scheduler: The learning rate scheduler to be used.
+        batch_size (int): Batch size for training (default is 8).
+        enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True).
+        enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning (default is False).
+    """
+
+    def __init__(
+        self,
+        iters: int = 200,
+        lr: float = None,
+        minmax_lr: float = None,
+        lr_scheduler=None,
+        batch_size: int = 8,
+        enable_minmax_tuning: bool = True,
+        enable_norm_bias_tuning: bool = False,
+    ):
+        self.iters = iters
+        if self.iters < 0:
+            logger.warning("`iters` must be non-negative, reset it to 200")
+            self.iters = 200
+
+        if not lr:
+            # TODO need to check 4 bits lr setting for auto-round-best, 3bits only validate on small models
+            if self.iters >= 1000 and getattr(self, "bits", 16) <= 3:
+                self.lr = 2.0 / self.iters
+                logger.info("set the lr to 2.0/iters for better accuracy")
+            else:
+                self.lr = 1.0 / self.iters if self.iters > 0 else 5e-3
+        else:
+            self.lr = lr
+        self.minmax_lr = minmax_lr or self.lr
+        self.lr_scheduler = lr_scheduler
+
+        self.batch_size = batch_size
+
+        # Some helpers
+        self.infer_bs_coeff = 1
+        self.batch_dim = None
+
+        self.enable_minmax_tuning = enable_minmax_tuning
+        self.enable_norm_bias_tuning = enable_norm_bias_tuning
+        if self.enable_norm_bias_tuning:
+            logger.warning("the `enable_norm_bias_tuning` feature is experimental and currently has limited support.")
diff --git a/auto_round/algorithms/quantization/auto_round/quantize.py b/auto_round/algorithms/quantization/auto_round/quantize.py
new file mode 100644
index 000000000..1941db2d0
--- /dev/null
+++ b/auto_round/algorithms/quantization/auto_round/quantize.py
@@ -0,0 +1,396 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from typing import Any, Callable, Optional, Union
+
+import accelerate
+import torch
+
+from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig
+from auto_round.compressors_new.utils import (
+    IndexSampler,
+    block_forward,
+    check_need_act_calibration,
+    check_skippable_keywords,
+    collect_best_params,
+    get_shared_keys,
+    infer_bits_by_data_type,
+    init_cache,
+    is_nv_fp,
+    reset_params,
+    set_layer_config,
+)
+from auto_round.context.compress_context import CompressContext
+from auto_round.context.model_context import ModelContext
+from auto_round.logger import logger
+from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_
+from auto_round.utils import (
+    clear_memory,
+    convert_module_to_hp_if_necessary,
+    is_auto_device_mapping,
+    memory_monitor,
+    mv_module_from_gpu,
+    set_amax_for_all_moe_layers,
+    to_device,
+)
+from auto_round.utils.device import (
+    clear_memory_if_reached_threshold,
+    get_major_device,
+    parse_available_devices,
+    set_auto_device_map_for_block_with_tuning,
+    set_non_auto_device_map,
+)
+from auto_round.utils.distributed import setup_ddp_if_needed_
+from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block
+
+
+class ARQuantizer:
+
+    def __init__(self, config: AutoRoundConfig):
+        self.config = config
+
+    def quantize_block(
+        self,
+        block: torch.nn.Module,
+        input_ids: Union[list[torch.Tensor], dict],
+        input_others: dict,
+        q_input: Union[torch.Tensor, dict, None] = None,
+        device: Union[str, torch.device] = "cpu",
+        auto_offload=True,
+    ):
+        """Quantize the weights of a given block of the model.
+
+        Args:
+            block: The block of the model to be quantized.
+            input_ids: The input tensor containing tokenized input ids.
+            input_others: A dictionary containing additional input data.
+            q_input: The quantized input tensor.
+            device: The device for quantization.
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + model_context = ModelContext.get_context() + compress_context = CompressContext.get_context() + + materialize_model_(block) + convert_module_to_hp_if_necessary(block, model_context._amp_dtype, device) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(compress_context.device_map) and len(compress_context.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + compress_context.device_map, + input_ids, + compress_context.low_gpu_mem_usage, + self.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(compress_context.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = self._register_act_max_hook(block) + + output = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + compress_context.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + output = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + compress_context.cache_device, + ) + hook_handles = self._register_act_max_hook(block) + if hook_handles: + self._get_block_outputs( + block, + q_input if q_input is not None else input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + compress_context.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=compress_context.device_list) + else: + clear_memory(device_list=compress_context.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.wrapper_block( + block, + self.enable_minmax_tuning, + self.enable_norm_bias_tuning, + enable_torch_compile=self.enable_torch_compile, + device=device, + ) + # Call this before quantization and after applying the block wrapper. + if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse. 
+ from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.lr) + minmax_lr = torch.tensor(self.minmax_lr) + is_adam = "adam" in self.__class__.__name__.lower() + + extra_kwargs = {} if is_adam else {"momentum": self.momentum} + + if self.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters + ) + else: + lr_schedule = copy.deepcopy(self.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.batch_size * self.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.gradient_accumulate_steps != 1 and not self.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self._get_current_num_elm(input_ids, whole_indices) + setup_ddp_if_needed_(self, block, self.device_list) + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.batch_size + for i in range(self.iters): + if self.enable_alg_ext and self.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.attention_mask: + num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) + + for tmp_step in range(self.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output(output, indices) + current_output = to_device(current_output, loss_device) + output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) + loss = self._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=compress_context.device_list) + + self._scale_loss_and_backward(scaler, loss) + + if self.low_gpu_mem_usage and card_0_in_high_risk: + # clear 
memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=compress_context.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.not_use_best_mse: + best_params = collect_best_params(block, compress_context.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.not_use_best_mse and i == self.iters - 1: + best_params = collect_best_params(block, compress_context.cache_device) + + if not self.not_use_best_mse: + if 0 < self.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.iters + if not self.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.low_gpu_mem_usage: + clear_memory(device_list=compress_context.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if is_nv_fp(self.act_data_type): + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.enable_quanted_input: + q_outputs = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + cache_device=compress_context.cache_device, + ) + + if len(compress_context.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=compress_context.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(compress_context.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=compress_context.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids: torch.Tensor | list[torch.Tensor], + input_others: torch.Tensor | dict, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + batch_dim: The batch dimension of the output tensor. + + Returns: + The output tensor of the block. 
+ """ + output = [] + nsamples = len(input_ids) + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, input_others, indices, self.seqlen, self.batch_dim, share_cache_keys=self.shared_cache_keys + ) + tmp_output = self.block_forward( + block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device + ).to(cache_device) + if save_output: + if self.batch_size == 1: + output.append(tmp_output) + else: + output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) + if self.low_gpu_mem_usage: + clear_memory(device_list=self.device_list) + + return output diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py new file mode 100644 index 000000000..18e300095 --- /dev/null +++ b/auto_round/algorithms/quantization/base.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseQuanizers: + def __init__(self): + pass + + def pre_quantize(self): + pass + + def quantize(self): + pass + + def post_quantize(self): + pass diff --git a/auto_round/algorithms/quantization/rtn/__init__.py b/auto_round/algorithms/quantization/rtn/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/quantization/rtn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py new file mode 100644 index 000000000..01d7fe939 --- /dev/null +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -0,0 +1,9 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +from auto_round.algorithms.alg_config import AlgConfig + + +class RTNConfig(AlgConfig): + def __init__(self): + super().__init__() diff --git a/auto_round/algorithms/quantization/rtn/rtn.py b/auto_round/algorithms/quantization/rtn/rtn.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/quantization/rtn/rtn.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/calibration/__init__.py b/auto_round/calibration/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/calibration/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/calibration/utils.py b/auto_round/calibration/utils.py new file mode 100644 index 000000000..0dc997fae --- /dev/null +++ b/auto_round/calibration/utils.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.utils import ( + is_quantized_input_module, +) + + +def _infer_last_cache_name(block_names, layer_names=None, requested_last_cache_name=None): + """The latest required cache layer for early-stop forward. + + If there are multiple cache targets, return ``None`` and let runtime + hooks stop only after all targets are observed in real forward execution. 
+ """ + if layer_names is None: + layer_names = [] + + if requested_last_cache_name is not None: + return requested_last_cache_name + + cache_targets = list(block_names) + list(layer_names) + if len(cache_targets) == 1: + return cache_targets[0] + + # return None here to enable the logic in _should_stop_cache_forward + return None + + +def _update_inputs(inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: + keys = inputs.keys() + input_id_str = [key for key in keys if key.startswith("hidden_state")] + if len(input_id_str) != 1: + raise RuntimeError( + "hidden_states arg mismatch error," "please raise an issue in https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_id_str[0], None) + if q_inputs is not None: + q_inputs = q_inputs.pop(input_id_str[0], None) + return inputs, q_inputs diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 73ed63206..1aef5bd7c 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import json +import os import random import re import sys @@ -208,6 +210,63 @@ def infer_bits_by_data_type(data_type: str): return None +def _get_safetensor_layer_names_not_in_model(model, all_module_names: list) -> list: + """Collect layer names from safetensor files that are not loaded into the model. + + Some tensors (e.g. MTP layers) exist in the original checkpoint but are not + instantiated by ``transformers``. This function discovers them so that regex + patterns in ``layer_config`` can still match them. + + Returns: + List of layer names (the path without the ``.weight`` suffix) for weight + tensors present in the safetensor files but absent from *all_module_names*. 
+ """ + name_or_path = None + if hasattr(model, "config") and hasattr(model.config, "name_or_path"): + name_or_path = model.config.name_or_path + if not name_or_path: + return [] + + if not os.path.isdir(name_or_path): + try: + from auto_round.utils.model import download_hf_model + + name_or_path = download_hf_model(name_or_path) + except Exception as e: + logger.debug(f"Could not resolve source model path to check for missing tensors: {e}") + return [] + + try: + from safetensors import safe_open + except ImportError: + return [] + + # Build tensor-name list from the safetensors index or single file + source_index_file = os.path.join(name_or_path, "model.safetensors.index.json") + source_single_file = os.path.join(name_or_path, "model.safetensors") + + tensor_names: list = [] + if os.path.exists(source_index_file): + with open(source_index_file) as f: + src_index = json.load(f) + tensor_names = list(src_index["weight_map"].keys()) + elif os.path.exists(source_single_file): + with safe_open(source_single_file, framework="pt", device="cpu") as f: + tensor_names = list(f.keys()) + else: + return [] + + module_name_set = set(all_module_names) + extra_layer_names = [] + for tensor_name in tensor_names: + if not tensor_name.endswith(".weight"): + continue + layer_name = tensor_name[: -len(".weight")] + if layer_name not in module_name_set: + extra_layer_names.append(layer_name) + return extra_layer_names + + def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], @@ -235,6 +294,10 @@ def dispatch_layer_config(layer_config: dict[str, dict]) -> None: """Assign scheme values as attributes to matched modules.""" for layer_name, scheme in layer_config.items(): module = get_module(model, layer_name) + if module is None: + # Layer exists in safetensor files but is not loaded into the model + # (e.g. MTP layers that transformers does not instantiate). Skip. + continue for attr, value in scheme.items(): setattr(module, attr, value) @@ -337,6 +400,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if isinstance(m, embedding_types) or m.__class__.__name__.endswith("Embedding"): embedding_layer_names.append(n) + # Also include layer names from safetensor files not loaded into the model + # (e.g. MTP layers that transformers does not instantiate). + safetensor_only_names = _get_safetensor_layer_names_not_in_model(model, all_module_names) + # 6. 
expand regex configs regex_config = {} for name in list(layer_config.keys()): @@ -346,12 +413,14 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str m = get_module(model, name) if len(list(m.children())) == 0 and type(m) not in supported_types: layer_config.pop(name) - logger.warning(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") + logger.debug(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") continue regex = re.compile(name) matched = [ln for ln in all_supported_layer_names if regex.search(ln)] - if not matched: + safetensor_only_matched = [ln for ln in safetensor_only_names if regex.search(ln)] + # skip it for mtp layers not loaded in transformers + if not matched and not safetensor_only_matched: raise ValueError(f"Invalid '{name}' in layer_config, no match found.") val = layer_config.pop(name) regex_config[name] = val # keep regex config @@ -898,6 +967,7 @@ def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): for name in all_layer_names: if fp_layer in name: not_to_quantized_layers.append(name) + not_to_quantized_layers.extend(ignore_layers) # keep regex name for later use logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}") return not_to_quantized_layers diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/compressors_new/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py new file mode 100644 index 000000000..7d9cbb9f8 --- /dev/null +++ b/auto_round/compressors_new/base.py @@ -0,0 +1,1846 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import os +import sys +import time +import traceback +from dataclasses import asdict, dataclass, fields +from enum import Enum +from functools import partial +from typing import Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from accelerate.utils import get_balanced_memory, get_max_memory +from tqdm import tqdm +from transformers import AutoConfig, set_seed + +from auto_round import envs +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.algorithms.base import BaseAlgorithm +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.auto_round.quantize import ARQuantizer +from auto_round.auto_scheme.gen_auto_scheme import AutoScheme +from auto_round.calibration.utils import ( + _infer_last_cache_name, + _update_inputs, +) +from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors_new.utils import ( + IndexSampler, + _get_quantized_layer_names_outside_blocks, + block_forward, + check_need_act_calibration, + check_skippable_keywords, + collect_best_params, + get_shared_keys, + infer_bits_by_data_type, + init_cache, + reset_params, + set_layer_config, +) +from auto_round.context.compress_context import CompressContext +from auto_round.context.model_context import ModelContext +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG +from auto_round.formats import OutputFormat, get_formats +from auto_round.logger import logger +from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.schemes import ( + QuantizationScheme, + _handle_special_schemes, + _parse_scheme, + get_gguf_scheme, + preset_name_to_scheme, +) +from auto_round.special_model_handler import get_predefined_ignore_layers, update_module +from auto_round.utils import ( + INNER_SUPPORTED_LAYER_TYPES, + SUPPORTED_DTYPES, + SUPPORTED_LAYER_TYPES, + TORCH_VERSION_AT_LEAST_2_6, + CpuInfo, + check_and_mark_quantized_module, + check_seqlen_compatible, + check_to_quantized, + clear_memory, + compile_func, + convert_dtype_str2torch, + convert_module_to_hp_if_necessary, + detect_device, + find_matching_blocks, + flatten_list, + get_block_names, + get_layer_names_in_block, + get_lm_head_name, + get_module, + global_state, + htcore, + is_auto_device_mapping, + is_debug_mode, + is_hpex_available, + is_moe_model, + is_moe_model_via_config, + is_quantized_input_module, + llm_load_model, + memory_monitor, + mv_module_from_gpu, + safe_device_move_with_meta_handling, + set_module, + to_device, + to_dtype, + unsupported_meta_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.utils.offload import OffloadManager +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +SERIALIZATION_KEYS = ( + "bits", + "act_bits", + "data_type", + "act_data_type", + "group_size", + "act_group_size", + "sym", + "act_sym", + "act_dynamic", + "amp", + "batch_size", + "enable_minmax_tuning", + "enable_norm_bias_tuning", + "enable_quanted_input", + "gradient_accumulate_steps", + "iters", + "lr", + "low_gpu_mem_usage", + "minmax_lr", + "nsamples", + "quant_block_list", + "regex_config", + "scale_dtype", 
+ "seqlen", + "supported_types", + "static_attention_dtype", + "static_kv_dtype", + "super_bits", + "super_group_size", + "to_quant_block_names", +) + + +class BackendDataType(str, Enum): + STANDARD_FP = "fp" + MX_FP = "mx_fp" + NV_FP = "nv_fp" + + +@dataclass +class QuantizationArgs: + bits: int = None + group_size: int = None + sym: bool = None + data_type: str = None + act_bits: int = None + act_group_size: int = None + act_sym: bool = None + act_data_type: str = None + act_dynamic: bool = None + super_bits: int = None + super_group_size: int = None + + def is_act_quantize(self): + return self.act_bits is not None and self.act_bits <= 8 + + def is_nv_fp(self, act=False): + data_type = self.data_type if act is False else self.act_data_type + return BackendDataType.NV_FP in data_type + + def is_mx_fp(self, act=False): + data_type = self.data_type if act is False else self.act_data_type + return BackendDataType.MX_FP in data_type + + def is_dynamic_wint8aint8(self): + if self.act_dynamic: + return True + if ("int8" in self.act_data_type or ("int" in self.act_data_type and self.act_bits == 8)) and ( + "int8" in self.data_type or ("int" in self.data_type and self.bits == 8) + ): + return True + return False + + def is_static_wfp8afp8(self, act=False): + data_type = self.data_type if act is False else self.act_data_type + return "fp8_static" in data_type + + def is_standard_fp(self, act=False): + data_type = self.data_type if act is False else self.act_data_type + return BackendDataType.STANDARD_FP in data_type and not self.is_mx_fp(act=act) and not self.is_nv_fp(act=act) + + def is_wfp8afp8(self): + if ( + ("fp8" in self.act_data_type or ("fp" in self.act_data_type and self.act_bits == 8)) + and ("fp8" in self.data_type or ("fp" in self.data_type and self.bits == 8)) + and self.is_standard_fp(act=True) + and self.is_standard_fp(act=False) + ): + return True + else: + return False + + @classmethod + def from_dict(cls, config: dict): + new_config = {} + for k, v in config.items(): + if hasattr(cls, k): + new_config[k] = v + return cls(**new_config) + + def non_default(self): + config = {} + for k, v in asdict(self).items(): + if v: + config[k] = v + return config + + def check_config(self): + if self.bits <= 0: + raise ValueError("`bits` must be positive") + if self.act_bits <= 0: + raise ValueError("`act_bits` must be positive") + if not (self.group_size == -1 or self.group_size >= 0): + raise ValueError("`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") + if not (self.act_group_size == -1 or self.act_group_size >= 0): + raise ValueError("`act_group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") + """Reset the default value of super_bits and super_group_size""" + if self.data_type.endswith("_dq"): + gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"] + self.super_bits = gguf_config.get("super_bits", None) if self.super_bits is None else self.super_bits + self.super_group_size = ( + gguf_config.get("super_group_size", None) if self.super_group_size is None else self.super_group_size + ) + + +class Compressor(object): + SKIP_ARGS = ("local_args", "kwargs", "cls", "config") + + def __new__( + cls, + config: Union[AlgConfig, list[AlgConfig]], + model: Union[torch.nn.Module, str], + tokenizer=None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", + platform="hf", + format=None, + **kwargs, + ): + # using different compressor base on AlgConfigs + local_args = {k: v for k, v in locals().items() if k not in 
cls.SKIP_ARGS} + + if isinstance(config, AutoRoundConfig): + return BaseCompressor(ARQuantizer(config), **local_args, **kwargs) + + +class BaseCompressor(object): + need_calib: bool = True + + def __init__( + self, + algorithms: Union[BaseAlgorithm, list[BaseAlgorithm]], + model: Union[torch.nn.Module, str], + tokenizer=None, + platform="hf", + format=None, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + enable_alg_ext: bool = False, + disable_opt_rtn: bool | None = None, + seed: int = 42, + low_cpu_mem_usage: bool = True, + **kwargs, + ): + self.quantization_args = QuantizationArgs.from_dict(kwargs) + + self.algorithms = algorithms if isinstance(algorithms, list) else [algorithms] + # TODO: refactor calibration + self.calibration = None + + self.scheme = scheme + self.formats = format + self.layer_config = layer_config + self.seqlen = seqlen + + # Extra/legacy kwargs for backward compatibility + # Major version releases may pack them with extra configuration options + amp = kwargs.pop("amp", True) + not_use_best_mse = kwargs.pop("not_use_best_mse", False) + dynamic_max_gap = kwargs.pop("dynamic_max_gap", -1) + nblocks = kwargs.pop("nblocks", 1) + to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) + enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True) + disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True) + enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) + self.momentum = kwargs.pop("momentum", 0.0) + enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) + self.quant_lm_head = kwargs.pop("quant_lm_head", False) + + self.ignore_layers = kwargs.pop("ignore_layers", "") + + self._offloader = OffloadManager(enabled=low_cpu_mem_usage, mode="offload", offload_dir_prefix="compressor") + + # Model related + model_dtype = kwargs.pop("model_dtype", None) + trust_remote_code = kwargs.pop("trust_remote_code") if "trust_remote_code" in kwargs else True + + self.scale_dtype = kwargs.pop("scale_dtype", None) + + self.static_attention_dtype = kwargs.pop("static_attention_dtype", None) + # Attention static dtype + if self.static_attention_dtype is not None: + logger.warning("The static attention dtype is experimental and currently has limited support.") + # KV cache, this one does not affect tuning but will collect some infos during tuning + self.static_kv_dtype = kwargs.pop("static_kv_dtype", None) + if self.static_kv_dtype is not None: + logger.warning("The static kv is experimental and currently has limited support.") + + if kwargs: + logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") + if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + # Deprecated, default not to use torch.use_deterministic_algorithms + if not disable_deterministic_algorithms or enable_deterministic_algorithms: + if not disable_deterministic_algorithms: + logger.warning( + "default not use deterministic_algorithms. 
disable_deterministic_algorithms is deprecated,"
+                    " please use enable_deterministic_algorithms instead. "
+                )
+
+            torch.use_deterministic_algorithms(True, warn_only=False)
+        else:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+
+        self.to_quant_block_names = to_quant_block_names
+
+        device = kwargs.pop("device", None)
+        if device is not None:
+            logger.warning("`device` is deprecated, please use `device_map` instead")
+
+        # Tuning hyperparameters
+        self.seed = seed
+        set_seed(self.seed)
+        self.enable_quanted_input = enable_quanted_input
+
+        self.nsamples = nsamples
+        self.seqlen = seqlen
+        self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps
+        self.nblocks = nblocks
+        self.dataset = dataset
+        self.iters = iters
+
+        if self.iters == 0:
+            self.lr = 5e-3
+
+        # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
+        # To avoid None issues, we keep a copy, though it's a little ugly
+        if enable_opt_rtn and disable_opt_rtn:
+            raise ValueError("`enable_opt_rtn` and `disable_opt_rtn` are mutually exclusive; " "only one can be set.")
+        if enable_opt_rtn:
+            disable_opt_rtn = False
+        self.orig_disable_opt_rtn = disable_opt_rtn
+        self.disable_opt_rtn = disable_opt_rtn
+        self.enable_torch_compile = enable_torch_compile
+
+        self.enable_alg_ext = enable_alg_ext
+        self.not_use_best_mse = not_use_best_mse
+        self.dynamic_max_gap = dynamic_max_gap
+
+        # Whether to pack the layer immediately after tuning
+        self.is_immediate_packing = False
+        self.is_immediate_saving = False
+
+        # Some helpers
+        self.batch_dim = None
+
+        torch.set_printoptions(precision=3, sci_mode=True)
+
+        if is_hpex_available():
+            logger.info("habana_frameworks is available, import htcore explicitly.")
+            import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
+
+        self.attention_mask = []
+
+        self.wrapper_block = wrapper_block
+        if self.enable_alg_ext:
+            try:
+                logger.warning_once("using algorithm extension for quantization.")
+                from auto_round.alg_ext import wrapper_autoround
+
+                wrapper_autoround(self)
+            except (ImportError, ModuleNotFoundError):
+                logger.error("algorithm extension import error, fallback to default mode")
+
+        self.compress_context = CompressContext.create_context(
+            low_cpu_mem_usage,
+            low_gpu_mem_usage,
+            device_map,
+            enable_torch_compile,
+        )
+        self.model_context = ModelContext.create_context(
+            model,
+            tokenizer=tokenizer,
+            platform=platform,
+            model_dtype=model_dtype,
+            trust_remote_code=trust_remote_code,
+            amp=amp,
+            need_calib=self.need_calib,
+            device=self.compress_context.device,
+        )
+
+    # backward compatible with the legacy API
+    def __getattr__(self, name: str) -> Any:
+        if name in self.__dict__:
+            return self.__dict__[name]
+
+        for obj in ["quantization_args", "model_context"]:
+            if obj not in self.__dict__:
+                continue
+            obj = object.__getattribute__(self, obj)
+            try:
+                return object.__getattribute__(obj, name)
+            except AttributeError:
+                continue
+
+        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+    def _check_configs(self) -> None:
+        """Checks if the configurations are valid.
+
+        Raises:
+            ValueError, TypeError: If any of the configurations are invalid.
+ """ + self.quantization_args.check_config() + if self.batch_size <= 0: + raise ValueError("`batch_size` must be positive") + if self.iters < 0: + raise ValueError("`iters` must be non-negative") + if self.seqlen <= 0: + raise ValueError("`seqlen` must be positive") + if self.nblocks <= 0: + raise ValueError("`nblocks` must be positive") + if self.gradient_accumulate_steps <= 0: + raise ValueError("`gradient_accumulate_steps` must be positive") + + if ( + self.quantization_args.is_act_quantize() + and ( + not self.quantization_args.is_nv_fp(act=True) or "static_gs" not in self.quantization_args.act_data_type + ) + and not self.quantization_args.is_mx_fp(act=True) + and not self.quantization_args.is_dynamic_wint8aint8() + and not self.quantization_args.is_static_wfp8afp8() + ): + logger.warning( + "activation quantization is an experimental feature with limited support and a complex API. " + "And please save the quantized model to fake format as real deployment is not supported currently" + ) + + if self.quantization_args.is_mx_fp() and self.group_size != 32: + logger.warning("dtype mx_fp should only support group_size of 32 in real deployment") + + if self.quantization_args.is_nv_fp() and (self.group_size != 16): + logger.warning("dtype nv_fp should only support group_size of 16 in real deployment") + + if self.nsamples < self.gradient_accumulate_steps * self.batch_size: + if self.batch_size > self.nsamples: + if self.iters > 0: # GGUF should log this warning, but we don't know the format here + logger.warning( + f"reset `batch_size` to {self.nsamples} as `nsamples`({self.nsamples})" + f" is smaller than batch_size({self.batch_size})" + ) + self.batch_size = self.nsamples + if self.gradient_accumulate_steps > self.nsamples // self.batch_size: + self.gradient_accumulate_steps = self.nsamples // self.batch_size + logger.warning( + f"reset `gradient_accumulate_steps` to {self.gradient_accumulate_steps}" + f" as nsamples must equal or greater" + f" than gradient_accumulate_steps * batch_size" + ) + + def _gen_auto_scheme(self) -> dict[str, dict]: + if self.mllm: + logger.info("AutoScheme is not yet supported for multimodal LLMs.") + sys.exit(-1) + + if is_quantized_input_module(self.model_context.model): + logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") + sys.exit(-1) + + all_dtypes = [] + all_gguf = True + for option in self.orig_scheme.options: + # Resolve the quantization scheme or data type + dtype = "int" + if isinstance(option, str): + if not option.lower().startswith("gguf"): + all_gguf = False + + option = preset_name_to_scheme(option) + + else: + all_gguf = False + + if isinstance(option, QuantizationScheme): + dtype = option.data_type + elif isinstance(option, dict): + dtype = option.get("data_type", "int") + + all_dtypes.append(dtype) + + # Check for mixed data types + unique_dtypes = set(all_dtypes) + if len(unique_dtypes) > 1 and not all_gguf: + logger.warning( + "Models with mixed data_types " + "cannot yet be exported to real formats except GGUF. " + "Please save the model using the `fake` format for now." 
+ ) + + layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( + self.model_context.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.ignore_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm, + ) + quant_layer_names = layer_config.keys() + scheme_keys = {f.name for f in fields(QuantizationScheme)} + fixed_layer_scheme_new = { + k: {key: v[key] for key in scheme_keys & v.keys()} + for k, v in layer_config.items() + if v.get("fixed_by_user", False) + } + + # mainly using quant_layers and fixed by users + from auto_round.auto_scheme.gen_auto_scheme import GenScheme + + if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage: + logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") + self.scheme_generator = GenScheme( + self.orig_scheme, + self.model_context.model, + quant_layer_names, + fixed_layer_scheme_new, + self.dataset, + device_map=self.compress_context.device_map, + tokenizer=self.tokenizer, + enable_torch_compile=self.enable_torch_compile, + ) + layer_config = self.scheme_generator.get_layer_config() + return layer_config + + def post_init(self): + assert self.model_context._model_loaded, "should load model first" + # should be set after loading model and set layer_config, cause some special scheme need these. + # Preserve the original, unparsed scheme for later use in auto scheme generation + # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). + default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme( + self.scheme, self.quantization_args.non_default() + ) + # Bind attributes to self for easy instance-level access + # for key, value in final_attrs.items(): + # setattr(self, key, value) + self.quantization_args = QuantizationArgs.from_dict(final_attrs) + self._check_configs() + + self.orig_scheme = copy.deepcopy(self.scheme) + self.scheme = default_scheme + + # check and update the format based on the current configuration + if self.formats: + self.formats = get_formats(self.formats, self) + gguf_scheme_name = get_gguf_scheme(self.scheme) + # GGUF uses fp32 scale dtype as default + if self.scale_dtype is None: + self.scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.scale_dtype = convert_dtype_str2torch(self.scale_dtype) + + if not hasattr(self, "quant_block_list"): + all_blocks = get_block_names(self.model_context.model) + self.quant_block_list = find_matching_blocks( + self.model_context.model, all_blocks, self.to_quant_block_names + ) + + # Set device, must place after model loading + set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) + + if self.iters != 0 and self.orig_disable_opt_rtn is not None: + logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") + self.disable_opt_rtn = True + if ( + self.quantization_args.bits >= 8 + and self.quantization_args.act_bits >= 8 + and self.iters == 0 + and self.quantization_args.data_type == "int" + and self.disable_opt_rtn is None + ): + logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") + self.disable_opt_rtn = True + if self.disable_opt_rtn is None and self.iters == 0: + logger.info( + "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." 
+ ) + self.disable_opt_rtn = False + + # after setting iters + self._adjust_torch_compile(self.enable_torch_compile) + + self.block_forward = ( + compile_func(block_forward, self.compress_context.device) if self.enable_torch_compile else block_forward + ) + + if not self.is_auto_scheme: + enable_gguf_official_mixed = True + else: + enable_gguf_official_mixed = False + + self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) + + if self.compress_context.low_cpu_mem_usage: + self._offloader.reset() + + def _should_disable_inplace_due_to_layers_outside_block() -> bool: + return self.has_qlayer_outside_block and self.need_calib + + # Disable inplace mode when there are quantized layers outside blocks + # under specific iteration/optimization settings. + if _should_disable_inplace_due_to_layers_outside_block(): + self.inplace = False + if not hasattr(self, "formats"): + logger.warning("this API is deprecated, please use `quantize_and_save` instead") + else: + # Determine if immediate packing is required + self._adjust_immediate_packing_and_saving() + + def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: + """Sets the torch compile configuration for the tuning.""" + self.enable_torch_compile = enable_torch_compile + if ( + not self.enable_torch_compile + and TORCH_VERSION_AT_LEAST_2_6 + and self.quantization_args.act_bits > 8 + and not is_debug_mode() + and "fp8" not in self.quantization_args.data_type + and "fp8" not in self.quantization_args.act_data_type + and self.iters > 0 + ): + logger.info( + "%s", + "'enable_torch_compile' is set to `False` by default. " + "Enabling it can reduce tuning cost by 20%, but it might throw an exception.", + ) + # On HPU, we rely on torch.compile to speed up the model execution. 
+        if self.enable_torch_compile and self.quantization_args.is_wfp8afp8 and not is_hpex_available():
+            self.enable_torch_compile = False
+            logger.warning("reset enable_torch_compile to `False` as fp8 is enabled")
+        # TODO: fix https://github.com/intel/auto-round/issues/1109
+        if self.enable_torch_compile and self.quantization_args.is_nv_fp(act=True):
+            self.enable_torch_compile = False
+            logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled")
+
+    def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True):
+        is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf()
+        if not is_gguf_format:
+            predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model)
+            if predefined_ignore_layers:
+                logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
+                tmp_str = ",".join(predefined_ignore_layers)
+                if self.ignore_layers == "":
+                    self.ignore_layers = tmp_str
+                else:
+                    self.ignore_layers += "," + tmp_str
+
+        if self.is_auto_scheme:
+            self.layer_config = self._gen_auto_scheme()
+        else:
+            self.layer_config = _handle_special_schemes(
+                self.orig_scheme,
+                self.layer_config,
+                self.model_context.model,
+                supported_types=SUPPORTED_LAYER_TYPES,
+                inner_supported_types=INNER_SUPPORTED_LAYER_TYPES,
+                quant_lm_head=self.quant_lm_head,
+                mllm=self.model_context.is_mllm,
+            )
+
+        fill_default_value = True
+        if self.is_auto_scheme:
+            fill_default_value = False
+        self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
+            self.model_context.model,
+            self.layer_config,
+            self.scheme,
+            self.scale_dtype,
+            SUPPORTED_LAYER_TYPES,
+            INNER_SUPPORTED_LAYER_TYPES,
+            self.quant_block_list,
+            self.ignore_layers,
+            self.quant_lm_head,
+            enable_gguf_official_mixed=enable_gguf_official_mixed,
+            is_mllm=self.model_context.is_mllm,
+            fill_default_value=fill_default_value,
+        )
+
+    def _adjust_immediate_packing_and_saving(self):
+        formats = getattr(self, "formats", [])
+        if len(formats) == 1 and not formats[0].is_fake() and self.inplace:
+            self.is_immediate_packing = True
+
+        if self.has_qlayer_outside_block and self.iters != 0:
+            self.is_immediate_packing = False
+
+        if not ("causallm" in self.model_context.model.__class__.__name__.lower() and not self.model_context.is_mllm):
+            # TODO: for tied keys there may be some issues; this has not been verified yet
+            tied_weight_keys = getattr(self.model_context.model, "_tied_weight_keys", {})
+            if len(tied_weight_keys) > 1:
+                self.is_immediate_saving = False
+                if self.compress_context.low_cpu_mem_usage:
+                    logger.warning("reset low_cpu_mem_usage to False due to tied weights")
+                return
+            if len(tied_weight_keys) == 1:
+                key = list(tied_weight_keys)[0]
+                if "lm_head" not in key:
+                    self.is_immediate_saving = False
+                    if self.compress_context.low_cpu_mem_usage:
+                        logger.warning("reset low_cpu_mem_usage to False due to tied weights")
+                    return
+
+        if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing:
+            self.is_immediate_saving = True
+
+        if self.compress_context.low_cpu_mem_usage and not self.is_immediate_packing:
+            logger.info(
+                "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. "
+                "Setting `low_cpu_mem_usage` to False."
+            )
+            self.compress_context.low_cpu_mem_usage = False
+            self.is_immediate_saving = False
+
+        if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing:
+            if formats[0].is_gguf():
+                logger.warning(
+                    "`low_cpu_mem_usage` is not fully supported for gguf format. 
" + "Setting `low_cpu_mem_usage` to False." + ) + self.compress_context.low_cpu_mem_usage = False + self.is_immediate_saving = False + elif self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0: + logger.info( + "Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): " + "RTN path uses blockwise quantization and supports per-block offloading." + ) + elif self.has_qlayer_outside_block and self.iters > 0: + logger.warning( + "`low_cpu_mem_usage` is not fully supported " + "when there are quantized layers outside blocks and optimized RTN is disabled. " + "Setting low_cpu_mem_usage to False." + ) + self.compress_context.low_cpu_mem_usage = False + self.is_immediate_saving = False + + if self.is_immediate_saving and "int" not in self.data_type: + logger.warning("immediate_saving is only supported for int quantization, set to False") + self.is_immediate_saving = False + + if self.orig_output_dir is None: + self.is_immediate_saving = False + + @torch.no_grad() + def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, last_cache_name=None): + """Attempts to cache intermediate data on GPU, if failed, then using CPU. + + Args: + block_names (list): List of block names to cache data for. + nsamples (int): Number of samples to use for caching. + layer_names (list, optional): List of layer names to cache data for. Defaults to []. + last_cache_name (str, optional): Name of the last cache. Defaults to None. + + Returns: + all_inputs: Cached intermediate data. + + Raises: + Exception: If caching on GPU fails, switches to CPU and caches there. + """ + if is_quantized_input_module(self.model_context.model): + layer_names = [] + if layer_names is None: + layer_names = [] + if self.compress_context.low_gpu_mem_usage or ( + len(block_names) == 1 + and len(layer_names) == 0 + and not self.has_qlayer_outside_block + and (last_cache_name is None or last_cache_name in block_names) + ): + # low_gpu_mem_usage or calibrate only the embedding layer, which is also very fast on CPU + all_inputs = self.cache_inter_data(block_names, nsamples, layer_names=[], last_cache_name=last_cache_name) + else: + try: + if any(p.device.type == "meta" for p in self.model_context.model.parameters()): + materialize_model_(self.model_context.model) + + if ( + hasattr(self.model_context.model, "hf_device_map") + and len(self.model_context.model.hf_device_map) > 1 + ): + self.model_context.model = dispatch_model( + self.model_context.model, device_map=self.model_context.model.hf_device_map + ) + else: + # Change this if new device is supported + if str(self.model_context.model.device) == "cpu" and ( + not self.compress_context.device.startswith("hpu") + ): + # type(self.model_context.model._no_split_modules) changes from list to set when transformers > 5.0 + no_split_modules = list(getattr(self.model_context.model, "_no_split_modules", [])) + devices = parse_available_devices(self.compress_context.device_map) + + max_memory = get_max_memory() + new_max_memory = {} + if "cpu" not in devices: + devices.append("cpu") + for device in devices: + if ":" in device: + device = int(device.split(":")[-1]) + elif device == "cpu": + device = "cpu" + elif isinstance(device, str): + device = 0 + else: + raise ValueError( + f"Unsupported device {device} in device_map: {self.compress_context.device_map}" + ) + if device not in max_memory: + # Skip devices that aee not reported by accelerate's max_memory. + # This is expected when a device is unavailable or cannot provide memory info. 
+ continue + # Use 90% of the reported max memory to leave headroom for activations, + # temporary tensors, other processes, and allocator fragmentation, reducing + # the chance of runtime OOM while still utilizing most available memory. + new_max_memory[device] = max_memory[device] * 0.9 + + # If non-CPU devices were requested but none survived, fall back to CPU caching + # via the OOM handler below, avoiding unnecessary dispatch overhead. + requested_non_cpu = any((d != "cpu") for d in devices) + has_non_cpu_memory = any((k != "cpu") for k in new_max_memory) + if requested_non_cpu and not has_non_cpu_memory: + raise torch.OutOfMemoryError( + "No non-CPU device available in accelerate's reported memory. " + "Falling back to CPU caching." + ) + + new_max_memory = get_balanced_memory( + self.model_context.model, + max_memory=new_max_memory, + no_split_module_classes=no_split_modules, + ) + self.model_context.model.tie_weights() + device_map = infer_auto_device_map( + self.model_context.model, + max_memory=new_max_memory, + no_split_module_classes=no_split_modules, + ) + if len(devices) > 1 and "cpu" in device_map.values(): + logger.warning( + "Some layers are offloaded to cpu, which may severely impact calibration speed." + " Please consider using more cards." + ) + + try: + + self.model_context.model = dispatch_model(self.model_context.model, device_map=device_map) + except ValueError as e: + if "offload_dir" in e.__str__(): + logger.warning( + f"Due to insufficient resources, disk is used to store the model." + f" `offload_dir={envs.AR_WORK_SPACE}`" + ) + self.model_context.model = dispatch_model( + self.model_context.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE + ) + else: + raise + else: + + self.model_context.model = self.model_context.model.to(self.compress_context.device) + + all_inputs = self.cache_inter_data( + block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name + ) + if ( + hasattr(self.model_context.model, "hf_device_map") + and len(self.model_context.model.hf_device_map) > 1 + ): + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.info("switch to cpu to cache block inputs") + self.compress_context.cache_device = torch.device("cpu") + if self.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM": + logger.warning( + "we recommend using more GPUs in calibration." + " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy." + ) + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + self.model_context.model = mv_module_from_gpu(self.model_context.model) + clear_memory(device_list=self.compress_context.device_list) + # Important change after v0.51, on cpu, we use rtn mode for layers in layer_names + all_inputs = self.cache_inter_data( + block_names, nsamples, layer_names=[], last_cache_name=last_cache_name + ) + except Exception as e: + logger.error(cuda_error_msg) + raise + return all_inputs + + @torch.no_grad() + def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_name=None): + """Save the inputs of block_name for calibration. + + This method temporarily replaces the forward method of the model to capture + the inputs passing through the specified block. It then calibrates the model + using a specified number of samples. Finally, it restores the original forward + method and returns the inputs for the specified block. 
+        Args:
+            block_names (list): The names of the blocks for which inputs are to be saved.
+            layer_names (list): The names of the layers for which inputs are to be saved.
+            nsamples (int): The number of samples to use for calibration.
+            last_cache_name (str, optional): The name of the last layer to be cached;
+                the forward pass can be stopped at this layer to save time.
+
+        Returns:
+            dict: A dictionary containing the inputs for the specified block.
+        """
+        if layer_names is None:
+            layer_names = []
+        self.inputs = {}
+        self.to_cached_layers = block_names + layer_names
+
+        tmp_dtype = None  # TODO: delete this, as most models are no longer fp32
+        ## known issue: this may misbehave if the block name is not the first block
+        if (len(block_names) > 1 or len(layer_names) > 0) and self.compress_context.low_gpu_mem_usage:
+            tmp_dtype = self.model_context.model.dtype
+            if self.amp:
+                # Cast to bfloat16 only when the model is not already in bfloat16
+                if self.model_context.model.dtype != torch.bfloat16:
+                    self.model_context.model = self.model_context.model.to(torch.bfloat16)
+            else:
+                self.model_context.model = self.model_context.model.to(torch.float32)  ## model on cpu
+
+        self.last_cache_name = _infer_last_cache_name(block_names, layer_names, last_cache_name)
+        self._cache_target_set = set(self.to_cached_layers)
+        self._cache_seen_targets = set()
+        calib_bs = self.batch_size
+        self.hook_handles = []
+        self._replace_forward()
+        self.calib(nsamples, calib_bs)
+        self.model_context.recover_forward()
+        res = self.inputs
+        del self.last_cache_name
+        del self._cache_target_set
+        del self._cache_seen_targets
+        del self.to_cached_layers
+        if tmp_dtype is not None:
+            self.model_context.model = self.model_context.model.to(tmp_dtype)
+
+        return res
+
+    @torch.no_grad()
+    def calib(self, nsamples, bs):
+        """Perform calibration for quantization.
+
+        This method calibrates the model for quantization by processing a specified
+        number of samples from the calibration dataset. It ensures that the data is
+        properly formatted and feeds it to the model. If the number of samples processed
+        is less than the specified number, it logs a warning. If no samples are processed,
+        it logs an error and exits.
+
+        Args:
+            nsamples (int): The number of samples to use for calibration.
+ bs (int): The number of samples to use for calibration + """ + from auto_round.calib_dataset import get_dataloader + + need_attention_mask = True + if isinstance(self.dataset, str): + need_attention_mask = False # all supported datasets does not use pad + dataset = self.dataset.replace(" ", "") ##remove all whitespaces + + # slow here + self.dataloader = get_dataloader( + self.tokenizer, + self.seqlen, + dataset, + self.seed, + bs, + self.nsamples, + ) + else: + self.dataloader = self.dataset + total_cnt = 0 + if self.dataloader.__class__.__name__ == "BatchEncoding": + self.dataloader = [self.dataloader.data] + + for data in self.dataloader: + if data.__class__.__name__ == "BatchEncoding": + data = data.data + if data is None: + continue + if isinstance(data, torch.Tensor): + input_ids = data.to(self.model.device) + data_new = input_ids + elif isinstance(data, str): + if self.tokenizer is None: + logger.error("please provide tokenizer for string input") + exit(-1) + data = self.tokenizer(data, truncation=True, max_length=self.seqlen, return_tensors="pt").data + data_new = {} + for key in data.keys(): + data_new[key] = data[key].to(self.model.device) + input_ids = data_new["input_ids"] + elif isinstance(data, tuple) or isinstance(data, list): + data_new = to_device(data, self.model.device) + input_ids = data_new[0] + else: + data_new = {} + for key in data.keys(): + data_new[key] = to_device(data[key], self.model.device) + if key == "images": + data_new[key] = to_dtype(data_new[key], self.model.dtype) + input_ids = data_new["input_ids"] + if input_ids.shape[-1] < self.seqlen: + continue + if need_attention_mask: + if ( + isinstance(data_new, dict) + and "attention_mask" in data_new + and data_new["attention_mask"] is not None + ): + new_attention_mask = data_new["attention_mask"] + elif ( + self.tokenizer is not None + and hasattr(self.tokenizer, "pad_token") + and self.tokenizer.pad_token is not None + ): + new_attention_mask = (input_ids != self.tokenizer.pad_token_id).to(torch.long) + else: + # Default all ones + new_attention_mask = torch.ones_like(input_ids, dtype=torch.long) + + # For each sample, check if there are trailing repeated tokens + # If so, set the mask of the last token to 0 + batch_size, seq_len = input_ids.shape + for i in range(batch_size): + last_token = input_ids[i, -1] + # Check for trailing repeats + j = seq_len - 2 + repeated = False + while j >= 0 and input_ids[i, j] == last_token: + repeated = True + new_attention_mask[i, j] = 0 + j -= 1 + # If there was at least one repeat, set last token mask to 0 + if repeated: + new_attention_mask[i, -1] = 0 + + # Workaround: some models treat an all-1 attention mask as equivalent to None and + # will internally replace it with None for block inputs, which can cause tensor + # concatenation / shape-mismatch issues in downstream code. To avoid providing an + # all-1 mask, we force the last token in each sequence to be masked out (set to 0) + # so that the mask is never "all ones". 
This means the model will not attend to the + # last position, so the impact on accuracy is minimal as basically equivalent to dropping a single token + new_attention_mask[:, -1] = 0 + + self.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) + else: + new_attention_mask = None + try: + kwargs = {"use_cache": False} + if new_attention_mask is not None and not (isinstance(data_new, dict) and "attention_mask" in data_new): + kwargs["attention_mask"] = new_attention_mask + + if isinstance(data_new, torch.Tensor): + self.model(data_new, **kwargs) + elif isinstance(data_new, tuple) or isinstance(data_new, list): + self.model(*data_new, **kwargs) + else: + self.model(**data_new, **kwargs) + except NotImplementedError: + pass + except RuntimeError as error: + error_msg = str(error) + if "The expanded size of the tensor" in str(error_msg) and "must match the existing size" in error_msg: + check_seqlen_compatible(self.seqlen, self.tokenizer, self.model) + logger.warning( + "When quantization encounters tensor shape mismatch error, " + "you can try to avoid it with batch_size=1" + ) + raise error + except Exception as error: + raise error + + total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1 + if total_cnt >= nsamples: + break + if total_cnt == 0: + logger.error( + f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " + f"dataset or decease the sequence length" + ) + exit(-1) + elif total_cnt < nsamples: + logger.warning_once( + f"An insufficient number of samples likely reduces the accuracy of the quantized model. " + f"Target samples count is {nsamples}, while valid samples count is {total_cnt}" + ) + + @torch.no_grad() + def _get_block_forward_func(self, name: str) -> Callable: + """Gets the forward function. + + Args: + name (str): The name of the function. + Returns: + function: The forward function. + """ + + def post_process_cache_data(batch_size, data, data_name): + """ + Processes store data for batch handling, reshaping if necessary. + + Args: + batch_size (int): The size of the batch. + data: The data value to store, potentially for caching. + data_name (str): Name of the data. + + Returns: + Processed data or None + """ + new_data = data + if batch_size <= 1: + return new_data + if data_name in self.shared_cache_keys: + return None + if "alibi" in data_name: + if isinstance(data, torch.Tensor): + alibi = data + alibi = alibi.reshape(batch_size, -1, alibi.shape[1], alibi.shape[2]) + new_data = alibi + return new_data + + def forward(m, hidden_states=None, *positional_inputs, **kwargs): + """Rewrite forward function, process and collect input data. + + Args: + hidden_states (torch.Tensor): The hidden states tensor. + *positional_inputs: Variable number of positional arguments. + **kwargs: Variable number of keyword arguments. + + Returns: + NotImplementedError: Getting the first layer inputs and then raise the error to save runtime. 
+ """ + if name not in self.inputs: + self.inputs[name] = {} + init_cache(positional_inputs, self.inputs[name]) + + if self.batch_dim is None: + self.batch_dim = 0 + if hidden_states is not None and self.batch_size > 1: + if hidden_states.shape[0] > self.batch_size: + self.batch_dim = 1 + if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.batch_size: + logger.error( + "this model has not been supported, " + "please raise an issue in https://github.com/intel/auto-round/issues" + " or try to set the `batch_size` to 1 and " + "`gradient_accumulate_steps` to your current batch size." + ) + exit(-1) + + if hidden_states is not None: + kwargs["hidden_states"] = hidden_states + + for key in kwargs.keys(): + if ( + isinstance(kwargs[key], torch.Tensor) + or isinstance(kwargs[key], list) + or isinstance(kwargs[key], tuple) + ): + if key not in self.inputs[name].keys(): # initialization + data = to_device(kwargs[key], device=torch.device("cpu")) + if data is None or (self.batch_size > 1 and key in self.shared_cache_keys): + self.inputs[name][key] = data + continue + if self.batch_size <= 1: + self.inputs[name][key] = [data] + else: + data = post_process_cache_data(self.batch_size, data, key) + self.inputs[name][key] = list(torch.split(data, 1, dim=self.batch_dim)) + else: # append cache inputs + new_data = post_process_cache_data(self.batch_size, kwargs[key], key) + if new_data is None: # shareable args or NoneType + continue + new_data = to_device(new_data, device=torch.device("cpu")) + if self.batch_size <= 1: + self.inputs[name][key].append(new_data) + else: + self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.batch_dim))) + elif isinstance(kwargs[key], (str, bool, type(None))): + if key not in self.inputs[name].keys(): + self.inputs[name][key] = kwargs[key] + else: + # Parameters not to be cached + if check_skippable_keywords(key): + logger.warning_once( + f"Please note that '{key}' key" " is not currently used in quantization fine-tuning." 
+ ) + reset_params(self.inputs[name]) + + if self._should_stop_cache_forward(name): + raise NotImplementedError + else: + if hidden_states is not None: + kwargs.pop("hidden_states") + return m.orig_forward(hidden_states, *positional_inputs, **kwargs) + else: + # Currently only for Llama-3.2-Vision-Instruct Series + return m.orig_forward(*positional_inputs, **kwargs) + + return forward + + @torch.no_grad() + def _get_cache_data_hook_for_layer(self, name): + """A forward hook to save input max of a module + :param name: the module name + :return: A hook function.""" + + def cache_input_hook(module, inputs, outputs): + input = inputs + if isinstance(inputs, tuple) or isinstance(input, list): + input = inputs[0] + if name in self.inputs: + self.inputs[name].extend(list(torch.split(input.to("cpu"), 1, dim=0))) + else: + self.inputs[name] = list(torch.split(input.to("cpu"), 1, dim=0)) + + if self._should_stop_cache_forward(name): + raise NotImplementedError + + return cache_input_hook + + def _replace_forward(self): + """Replaces the forward function.""" + + def register_hook(n, m, hook_handles): + if n in self.to_cached_layers and type(m) not in SUPPORTED_LAYER_TYPES: ##block + m.orig_forward = m.forward + m.forward = partial(self._get_block_forward_func(n), m) + elif n in self.to_cached_layers: ##linear layer or conv1d layer + hook_func = self._get_cache_data_hook_for_layer(n) + hook_handle = m.register_forward_hook(hook_func) + hook_handles.append(hook_handle) + + self.model_context.replace_forward(register_hook) + + def _should_stop_cache_forward(self, name: str) -> bool: + """Determine whether current forward pass can stop after caching `name`.""" + if name == self.last_cache_name: + return True + + if self.last_cache_name is not None: + return False + + if not hasattr(self, "_cache_target_set") or not hasattr(self, "_cache_seen_targets"): + return False + + if name in self._cache_target_set: + self._cache_seen_targets.add(name) + + if not self._cache_target_set.issubset(self._cache_seen_targets): + return False + + # Lock the last cache name after the first full forward pass. + self.last_cache_name = name + return True + + def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): + input_ids, input_others = self._split_inputs(inputs, first_input_name) + clear_memory(device_list=self.compress_context.device_list) + input_ids = to_device(input_ids, self.compress_context.cache_device) + input_others = to_device(input_others, self.compress_context.cache_device) + # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage + + tmp_dtype = self.model_context._amp_dtype if self.model_context._amp else torch.float32 + input_ids = to_dtype(input_ids, tmp_dtype) + + for key in input_others.keys(): + if isinstance(input_others[key], torch.Tensor) and ( + input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 + ): + input_others[key] = input_others[key].to(tmp_dtype) + elif isinstance(input_others[key], list): + for i in range(len(input_others[key])): + to_dtype(input_others[key][i], tmp_dtype) + return input_ids, input_others + + def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: + input_ids = inputs[first_input_name] + inputs.pop(first_input_name, None) + input_others = inputs + return input_ids, input_others + + @torch.inference_mode() + def _quantize_embedding_layer(self): + """Quantizes embedding layers in the model according to the configuration. 
+ + This method iterates through all modules in the model, identifies embedding + layers specified in `self.layer_config`, and applies the appropriate quantization + function based on bit precision, grouping strategy, and dtype. + + Returns: + bool: True if the quantization process completes without critical errors. + """ + is_quantized = False + for name, module in self.model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: + continue + + config = self.layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = self.scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + # Optionally use optimized rounding (RTN) variant + if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: + dtype = f"rtn_{dtype}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=self.compress_context.device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + self.layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=self.compress_context.device_list) + + return is_quantized + + def _quantize_blocks( + self, + model: torch.nn.Module, + inputs: dict, + block_names: list, + q_input: torch.Tensor = None, + nblocks: int = 1, + device: str = "cpu", + pbar: tqdm = None, + ): + """Quantize and dequantize the weights of the specified blocks in the model. + + Args: + model: The PyTorch model to be quantized. + inputs: The input data for quantization. + block_names: The names of the blocks to be quantized and dequantized. + nblocks: The number of blocks to quantize and dequantize. + device: The device for quantization and dequantization. 
+ + Returns: + None + """ + clear_memory(device_list=self.compress_context.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = self._preprocess_block_inputs(inputs) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + if self.compress_context.low_cpu_mem_usage: + if nblocks == 1: + self._offloader.reload(model, n) + else: + self._offloader.reload(model, names) + + m.config = model.config if hasattr(model, "config") else None + for alg in self.algorithms: + q_input, input_ids = alg.quantize_block( + m, + input_ids, + input_others, + q_input=q_input, + device=device, + ) + if hasattr(model, "config"): + del m.config + if self.is_immediate_packing: + for n, tmp_m in m.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self._immediate_pack(tmp_m.global_name) + + if self.is_immediate_saving: + shard_writer(self, m, is_finalize=False) + + if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: + if nblocks == 1: + self._offloader.offload(model, n, overwrite=True) + else: + for name in names: + self._offloader.offload(model, name, overwrite=True) + if pbar is not None: + pbar.update(1) + + if not self.is_immediate_saving: + self.model = mv_module_from_gpu(self.model) + for n, m in self.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compress_context.device_list) + + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. + Returns: + The quantized model and layer configurations. + """ + self.model_context._load_model() + self.post_init() + self.model_context.initialize(formats=self.formats) + + self._check_compatibility() + + if bool(self.quant_block_list): + all_blocks = self.quant_block_list + else: + all_blocks = get_block_names(self.model_context.model) + + if len(all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.model_context.model, self.layer_config + + layer_names = _get_quantized_layer_names_outside_blocks( + model=self.model_context.model, + layer_config=self.layer_config, + supported_types=SUPPORTED_LAYER_TYPES, + quant_block_list=self.quant_block_list, + ) + start_time = time.time() + all_first_block_names = [block[0] for block in all_blocks] + if len(layer_names) > 0: + logger.info( + "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names + ) + else: + logger.info("start to cache block inputs") + all_inputs = self.try_cache_inter_data_gpucpu( + all_first_block_names, + self.nsamples, + layer_names, + ) + self.inputs = all_inputs + is_quantized_embedding = self._quantize_embedding_layer() + clear_memory(device_list=self.compress_context.device_list) + all_q_inputs = None + if is_quantized_embedding: + all_inputs = copy.deepcopy(self.inputs) + clear_memory(self.inputs, device_list=self.compress_context.device_list) + all_q_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) + self.inputs = all_q_inputs + # Remove accelerate dispatch hooks before moving parameters. + # hf_device_map is kept for reference but hooks are no longer needed. + if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + self.model_context.model = mv_module_from_gpu(self.model_context.model) + clear_memory(device_list=self.compress_context.device_list) + logger.info("caching done") + if self.compress_context.low_cpu_mem_usage: + self._offloader.offload( + self.model_context.model, all_blocks, clear_memory=True, device_list=self.compress_context.device_list + ) + if len(all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) + else: + pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar + + for block_names in all_blocks: + inputs = all_inputs[block_names[0]] + all_inputs.pop(block_names[0]) + q_inputs = None + if all_q_inputs is not None: + q_inputs = all_q_inputs[block_names[0]] + all_q_inputs.pop(block_names[0]) + + inputs, q_inputs = _update_inputs(inputs, q_inputs) + + clear_memory(self.inputs, device_list=self.compress_context.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.batch_size: + self.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.model_context.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.nblocks, + device=self.compress_context.device, + pbar=pbar, + ) + if self.is_immediate_packing and len(self.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.formats)} formats." 
+ ) + pbar.set_description("Quantizing done") + pbar.close() + self._quantize_layers(layer_names, all_inputs) + + convert_module_to_hp_if_necessary( + self.model_context.model, self.amp_dtype, self.compress_context.device, to_cpu=True + ) + if self.is_immediate_saving: + shard_writer(self, is_finalize=True) + + if self.compress_context.low_cpu_mem_usage: + self._offloader.reload(self.model_context.model) + + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + # Dump a summary + quantized_layers = [] + unquantized_layers = [] + for n, m in self.model_context.model.named_modules(): + if isinstance(m, tuple(SUPPORTED_LAYER_TYPES)): + if check_to_quantized(m): + quantized_layers.append(n) + else: + unquantized_layers.append(n) + elif hasattr(m, "scales") or hasattr(m, "scale"): # packing_immediately + quantized_layers.append(n) + summary_info = ( + f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" + ) + if len(unquantized_layers) > 0: + summary_info += f", {unquantized_layers} have not been quantized" + logger.info(summary_info) + + self.model_context.quantized = True + return self.model_context.model, self.layer_config + + def _check_compatibility(self) -> None: + """Checks compatibility of the configurations and model.""" + if ( + self.seqlen is not None + and hasattr(self.model_context.model, "config") + and hasattr(self.model_context.model.config, "max_position_embeddings") + ): + if self.model_context.model.config.max_position_embeddings < self.seqlen: + logger.warning( + f"Change sequence length to {self.model_context.model.config.max_position_embeddings} " + "due to the limitation of max_position_embeddings" + ) + self.seqlen = min(self.seqlen, self.model_context.model.config.max_position_embeddings) + + if self.seqlen is not None and hasattr(self.tokenizer, "model_max_length"): + if self.tokenizer.model_max_length < self.seqlen: + logger.warning( + f"Change sequence length to {self.tokenizer.model_max_length} " + "due to the limitation of model_max_length. " + "You can also try to increase the model_max_length to avoid this issue." + ) + self.seqlen = min(self.seqlen, self.tokenizer.model_max_length) + + if self.group_size == 0 and "fp8" not in self.data_type: + logger.warning("`group_size==0` is not supported for data_type other than fp8 ") + + if self.bits <= 2 and (self.iters < 1000 or not self.enable_alg_ext) and self.super_group_size is None: + logger.warning( + "for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` " + ) + + def _get_save_folder_name(self, format: OutputFormat) -> str: + """Generates the save folder name based on the provided format string. + + If there are multiple formats to handle, the function creates a subfolder + named after the format string with special characters replaced. If there's + only one format, it returns the original output directory directly. + + Args: + format_str (str): The format identifier (e.g., 'gguf:q2_k_s'). + + Returns: + str: The path to the folder where results should be saved. 
+ """ + # Replace special characters to make the folder name filesystem-safe + sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") + + # Use a subfolder only if there are multiple formats + if len(self.formats) > 1: + return os.path.join(self.orig_output_dir, sanitized_format) + + return self.orig_output_dir + + def save_quantized( + self, + output_dir: str = None, + format: Union[str, list[OutputFormat]] = None, + inplace: bool = True, + return_folders=False, + **kwargs, + ) -> torch.nn.Module: + """Save the quantized model to the specified output directory in the specified format. + + Args: + output_dir (str, optional): The directory to save the quantized model. Defaults to None. + format (str, optional): The format in which to save the model. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place. Defaults to True. + **kwargs: Additional keyword arguments specific to the export format. + + Returns: + object: The compressed model object. + """ + self.orig_output_dir = output_dir + if format is not None: + logger.warning( + f"save_quantized with format is deprecated and will be deleted in auto_round version 1.0." + f" Please use Compressor(format='{format}' instead)." + ) + if isinstance(format, str) and getattr(self, "formats", None) is None: + formats = get_formats(format, self) + if not hasattr(self, "formats"): + self.formats = formats + + if not self.quantized: + logger.warning("please run autoround.quantize first") + return + folders = [] + for format in self.formats: + save_folder = self._get_save_folder_name(format) + if self.act_bits <= 8 and format.is_fake(): + logger.warning( + "Support for exporting activation quantization is limited. " + "Please ensure that your configuration is supported." + ) + + serialization_dict = {} + for key in SERIALIZATION_KEYS: + serialization_dict[key] = getattr(self, key) + from auto_round.version import __version__ + + serialization_dict["autoround_version"] = __version__ + if "scale_dtype" in serialization_dict.keys(): + serialization_dict["scale_dtype"] = str(serialization_dict["scale_dtype"]) + compressed_model = format.save_quantized( + save_folder, + model=self.model_context.model, + layer_config=self.layer_config, + inplace=inplace, + tokenizer=self.tokenizer, + device=self.compress_context.device, + serialization_dict=serialization_dict, + **kwargs, + ) + folders.append(save_folder) + + if return_folders: + return compressed_model, folders + else: + return compressed_model + + def quantize_and_save( + self, output_dir: str = "tmp_autoround", format: str = None, inplace: bool = True, **kwargs + ) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantizes the model and saves it in the specified format(s). + + This function checks the validity of the requested format(s), quantizes + the model accordingly, and saves it to the specified output directory. + If multiple formats are provided, the model is saved separately for each format. + + Args: + output_dir (str, optional): The directory where the quantized model + will be saved. Defaults to "tmp_autoround". + format (str, optional): The quantization format(s) to use, separated + by commas if multiple. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place if only + one format is used. Defaults to True. + **kwargs: Additional arguments for the quantization and saving process. 
+ + Returns: + model: A qdq model or packed model based on the configurations + folders: The folder paths where the quantized models are saved. + + Raises: + ValueError: If an unsupported format is specified. + """ + # Validate and process the specified formats + self.orig_output_dir = output_dir + + # check and update the format based on the current configuration + if format and self.formats is not None: + logger.warning( + f"quantize_and_save with format is deprecated and will be deleted in auto_round version 1.0." + f" Please use Compressor(format='{format}' instead)." + ) + format_list = get_formats(format, self) + self.formats = format_list + if self.formats is None: + logger.info("format is not set, using default auto_round format.") + self.formats = get_formats("auto_round", self) + + # If multiple formats are specified, enforce inplace=False + if len(self.formats) > 1: + inplace = False + self.inplace = kwargs.get("inplace", inplace) + kwargs.pop("inplace", None) + + # Perform model quantization + if self.static_attention_dtype is not None: + from auto_round.experimental.attention import attention_quant_ctx + + with attention_quant_ctx(self.model_context.model, static_attention_dtype=self.static_attention_dtype): + model, _ = self.quantize() + elif self.static_kv_dtype is not None: + from auto_round.experimental.kv_cache import kvcache_quant_context + + with kvcache_quant_context(self.model_context.model, static_kv_dtype=self.static_kv_dtype): + model, _ = self.quantize() + else: + model, _ = self.quantize() + # Save the quantized model in the specified format_list + model, folders = self.save_quantized(output_dir, inplace=inplace, return_folders=True, **kwargs) + memory_monitor.log_summary() + + return model, folders + + +class TuneCompressor(BaseCompressor): + need_calib: bool = True + + +class ZeroShotCompressor(BaseCompressor): + need_calib: bool = False + + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. + Returns: + The quantized model and layer configurations. + """ + + pass diff --git a/auto_round/compressors_new/config.py b/auto_round/compressors_new/config.py new file mode 100644 index 000000000..dd028b6cd --- /dev/null +++ b/auto_round/compressors_new/config.py @@ -0,0 +1,296 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
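+
+# Illustrative usage (a sketch only, not executed anywhere in this module; the argument
+# names simply mirror the ExtraConfig class and dataclasses defined below):
+#
+#     extra = ExtraConfig(bits=4, group_size=128, sym=True, quant_lm_head=False)
+#     flat = extra.to_dict()  # merged view of the tuning/scheme/mllm/diffusion extra configs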
+from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Any, Callable, Optional, Union + +import torch + + +class ExtraConfig: + """Class for extra or legacy configs.""" + + _model_config = None + _scheme_config = None + _tuning_config = None + _mllm_config = None + _diffusion_config = None + + def __init__( + self, + # tuning + amp: bool = True, + disable_opt_rtn: bool | None = None, + enable_alg_ext: bool = False, + enable_minmax_tuning: bool = True, + enable_norm_bias_tuning: bool = False, + enable_quanted_input: bool = True, + enable_deterministic_algorithms: bool = False, + lr: float = None, + lr_scheduler: Callable = None, + minmax_lr: float = None, + nblocks: int = 1, + to_quant_block_names: Union[str, list, None] = None, + scale_dtype: str = "fp16", + # scheme + bits: int = None, + group_size: int = None, + sym: bool = None, + data_type: str = None, + act_bits: int = None, + act_group_size: int = None, + act_sym: bool = None, + act_data_type: str = None, + act_dynamic: bool = None, + super_bits: int = None, + super_group_size: int = None, + static_kv_dtype: Union[str, torch.dtype] = None, + quant_lm_head: bool = False, + ignore_layers: str = None, + # mllm + processor: Callable = None, + image_processor: Callable = None, + quant_nontext_module: bool = False, + extra_data_dir: str = None, + template: str = None, + # diffusion + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, + ): + """Initialize + + Args: + amp (bool): Whether to use automatic mixed precision (default is True). + disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True. + enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False. + enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True. + enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning. + enable_quanted_input (bool): Whether to use quantized input data (default is True). + enable_deterministic_algorithms (bool): Whether to use deterministic_algorithms. + lr (float): The learning rate (default is 0.005). + lr_scheduler: The learning rate scheduler to be used. + minmax_lr (float): The learning rate for min-max tuning (default is None). + nblocks (int): Number of blocks (default is 1). + quant_lm_head (bool): Whether to quant lm_head. + to_quant_block_names (str|list): Names of quantitative blocks, please use commas to separate them. + scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels + bits (int, optional): Weight quantization bits. Defaults to 4. + group_size (int, optional): Weight quantization group size. Defaults to 128. + sym (bool, optional): Symmetric weight quantization. Defaults to True. + data_type (str, optional): Weight data type string, e.g., "int". Defaults to "int". + act_bits (int, optional): Activation quantization bits. Defaults to 16. + act_group_size (int, optional): Activation group size. Defaults to None. + act_sym (bool, optional): Symmetric activation quantization. Defaults to None. + act_data_type (str, optional): Activation data type; inherits weight dtype if None and act_bits < 16. + act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True. + super_bits (int): number of scale and mins quant bits for double quant. + super_group_size (int): the number of super group size when use double quant. 
+ static_kv_dtype (str): The data type of kv-cache to be used. + processor: Any multi-modal model will require an object to encode or + decode the data that groups several modalities (among text, vision and audio). + image_processor: Image processor for special model like llava. + quant_nontext_module: Whether to quantize nontext module. + extra_data_dir: The path of extra data such as images, audio and videos. + template: The path or name of template used to specify process for different MLLMs. + guidance_scale (float): Control how much the image generation process follows the text prompt. + The more it is, the more closely it follows the prompt (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise for image generation (default is None). + """ + self.tuning_config = TuningExtraConfig( + amp=amp, + disable_opt_rtn=disable_opt_rtn, + enable_alg_ext=enable_alg_ext, + enable_minmax_tuning=enable_minmax_tuning, + enable_norm_bias_tuning=enable_norm_bias_tuning, + enable_quanted_input=enable_quanted_input, + enable_deterministic_algorithms=enable_deterministic_algorithms, + lr=lr, + lr_scheduler=lr_scheduler, + minmax_lr=minmax_lr, + nblocks=nblocks, + to_quant_block_names=to_quant_block_names, + scale_dtype=scale_dtype, + ) + self.scheme_config = SchemeExtraConfig( + bits=bits, + group_size=group_size, + sym=sym, + data_type=data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, + super_bits=super_bits, + super_group_size=super_group_size, + static_kv_dtype=static_kv_dtype, + quant_lm_head=quant_lm_head, + ignore_layers=ignore_layers, + ) + self.mllm_config = MLLMExtraConfig( + processor=processor, + image_processor=image_processor, + quant_nontext_module=quant_nontext_module, + extra_data_dir=extra_data_dir, + template=template, + ) + self.diffusion_config = DiffusionExtraConfig( + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + ) + + @property + def tuning_config(self): + return self._tuning_config + + @tuning_config.setter + def tuning_config(self, config: TuningExtraConfig): + assert isinstance( + config, TuningExtraConfig + ), f"tuning_config should be ModelExtraConfig, but got {config.__class__.__name__}" + self._tuning_config = config + + @property + def scheme_config(self): + return self._scheme_config + + @scheme_config.setter + def scheme_config(self, config: SchemeExtraConfig): + assert isinstance( + config, SchemeExtraConfig + ), f"scheme_config should be SchemeExtraConfig, but got {config.__class__.__name__}" + self._scheme_config = config + + @property + def mllm_config(self): + return self._mllm_config + + @mllm_config.setter + def mllm_config(self, config: MLLMExtraConfig): + if config is None: + self._mllm_config = None + else: + assert isinstance( + config, MLLMExtraConfig + ), f"mllm_config should be MLLMExtraConfig, but got {config.__class__.__name__}" + self._mllm_config = config + + @property + def diffusion_config(self): + return self._diffusion_config + + @diffusion_config.setter + def diffusion_config(self, config: DiffusionExtraConfig): + if config is None: + self._diffusion_config = None + else: + assert isinstance( + config, DiffusionExtraConfig + ), f"diffusion_config should be DiffusionExtraConfig, but got {config.__class__.__name__}" + self._diffusion_config = config + + def to_dict(self): + output_dict = 
{} + for config in self.__dict__.values(): + if config: + output_dict.update(config.to_dict()) + return output_dict + + +@dataclass +class BaseExtraConfig: + + @classmethod + def get_attributes(cls: "BaseExtraConfig") -> list[str]: + return [field.name for field in fields(cls)] + + def __getitem__(self, key: str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + return getattr(self, key) + + def __setitem__(self, key: str, value: None | int | str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + setattr(self, key, value) + + def __contains__(self, item): + return item in self.get_attributes() + + def to_dict(self): + return self.__dict__ + + def is_default(self): + for field in fields(self): + default_value = field.default + current_value = getattr(self, field.name) + if current_value != default_value: + return False + return True + + +@dataclass +class TuningExtraConfig(BaseExtraConfig): + amp: bool = True + disable_opt_rtn: bool | None = None + enable_alg_ext: bool = False + enable_minmax_tuning: bool = True + enable_norm_bias_tuning: bool = False + enable_quanted_input: bool = True + enable_deterministic_algorithms: bool = False + lr: float = None + lr_scheduler: Callable = None + minmax_lr: float = None + nblocks: int = 1 + to_quant_block_names: Union[str, list, None] = None + scale_dtype: str = "fp16" + + +@dataclass +class SchemeExtraConfig(BaseExtraConfig): + bits: int = None + group_size: int = None + sym: bool = None + data_type: str = None + act_bits: int = None + act_group_size: int = None + act_sym: bool = None + act_data_type: str = None + act_dynamic: bool = None + super_bits: int = None + super_group_size: int = None + static_kv_dtype: Union[str, torch.dtype] = None + static_attention_dtype: Union[str, torch.dtype] = None + quant_lm_head: bool = False + ignore_layers: str = None + + +@dataclass +class MLLMExtraConfig(BaseExtraConfig): + processor: Callable = None + image_processor: Callable = None + quant_nontext_module: bool = False + extra_data_dir: str = None + template: str = None + + +@dataclass +class DiffusionExtraConfig(BaseExtraConfig): + guidance_scale: float = 7.5 + num_inference_steps: int = 50 + generator_seed: int = None diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py new file mode 100644 index 000000000..990c5c387 --- /dev/null +++ b/auto_round/compressors_new/shard_writer.py @@ -0,0 +1,242 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from collections import OrderedDict + +import torch + +from auto_round.logger import logger +from auto_round.utils import get_lm_head_name, get_module + + +class ShardWriter: + """ + Handles shard-saving of model parameters to disk with memory management. 
+ """ + + def __init__(self, rounder): + self.model = rounder.model + self.lm_head_name = get_lm_head_name(self.model) + total_params = sum(p.numel() for p in self.model.parameters()) + # Heuristic estimate of model size in GB used to choose a default max_shard_size: + # - total_params * rounder.bits -> total number of bits in all parameters + # - // 8 -> convert bits to bytes + # - // 1e9 -> approx convert bytes to GB (1e9 bytes ~= 1 GB) + # - final // 10 -> apply a safety margin so default shards are + # smaller than the full model; this intentionally + # underestimates size before clamping below. + max_split_num = 10 + model_size = int(total_params * rounder.bits // 1e9 // 8 + max_split_num - 1) / max_split_num + model_size = max(1, min(int(model_size), 5)) + + # Configuration + self.max_shard_size = self._parse_size(getattr(rounder, "max_shard_size", f"{model_size}GB")) + self.safe_serialization = getattr(rounder, "safe_serialization", True) + + # Internal State + self.use_safetensors = self._check_safetensors() + self.shard_suffix = "safetensors" if self.use_safetensors else "bin" + self.current_shard_tensors = OrderedDict() + self.current_shard_size = 0 + self.shard_meta = [] # List of {tmp_file: str, params: list} + self.global_weight_map = {} + self.shard_counter = 0 + + # Stats + self.total_param_elems = 0 + self.total_param_size_bytes = 0 + self.skipped_meta_tensors = [] + + # Directory Setup + self.output_dir = os.path.join(rounder._get_save_folder_name(rounder.formats[0]), "") + os.makedirs(self.output_dir, exist_ok=True) + + def _parse_size(self, size_str: str) -> int: + if isinstance(size_str, int): + return size_str + s = size_str.strip().upper() + units = {"GB": 1024**3, "MB": 1024**2, "KB": 1024, "B": 1} + for unit, mult in units.items(): + if s.endswith(unit): + return int(float(s[: -len(unit)]) * mult) + return int(s) + + def _check_safetensors(self) -> bool: + if self.safe_serialization: + try: + import safetensors.torch + + return True + except ImportError: + logger.warning("safetensors not installed; falling back to torch.save.") + return False + + def save_module(self, m: torch.nn.Module, name: str = None): + """Extracts and accumulates tensors from a module.""" + prefix = name if name is not None else getattr(m, "global_name", "model") + sd = m.state_dict() + + for k, v in sd.items(): + if not isinstance(v, torch.Tensor): + continue + param_name = f"{prefix}.{k}" + self._add_tensor(param_name, v) + + def _add_tensor(self, name: str, tensor: torch.Tensor): + if isinstance(tensor, torch.Tensor) and tensor.device.type == "meta": + self.skipped_meta_tensors.append(name) + return + t_size = tensor.nbytes + self.total_param_elems += tensor.numel() + self.total_param_size_bytes += t_size + tensor = tensor.detach().cpu() + # If single tensor exceeds limit, flush current, save it solo, then continue + if t_size > self.max_shard_size: + self._flush_shard() + self.current_shard_tensors[name] = tensor + self.current_shard_size = t_size + self._flush_shard() + # If adding exceeds limit, flush first + elif self.current_shard_size + t_size > self.max_shard_size and self.current_shard_size > 0: + self._flush_shard() + self.current_shard_tensors[name] = tensor + self.current_shard_size = t_size + else: + self.current_shard_tensors[name] = tensor + self.current_shard_size += t_size + + def _flush_shard(self): + if not self.current_shard_tensors: + return + + self.shard_counter += 1 + tmp_name = f"model-shard-{self.shard_counter:05d}.{self.shard_suffix}" + tmp_path = 
os.path.join(self.output_dir, tmp_name) + + if self.use_safetensors: + from safetensors.torch import save_file + + save_file(self.current_shard_tensors, tmp_path) + else: + torch.save(self.current_shard_tensors, tmp_path) + + saved_params = list(self.current_shard_tensors.keys()) + self.shard_meta.append({"tmp_file": tmp_name, "params": saved_params}) + + # Offload logic: move modules to meta device once all params are saved + self._offload_to_meta(saved_params) + + self.current_shard_tensors = OrderedDict() + self.current_shard_size = 0 + + def _offload_to_meta(self, saved_params): + """Attempts to move fully saved modules to the 'meta' device to free RAM.""" + # Using a set for faster lookup of all saved parameters + all_saved = {p for meta in self.shard_meta for p in meta["params"]} + + for param_full_name in saved_params: + module_path = param_full_name.rsplit(".", 1)[0] + + module = get_module(self.model, module_path) + # Check if all parameters of this module are now in 'all_saved' + if ( + module is not None + and isinstance(module, torch.nn.Module) + and all(f"{module_path}.{k}" in all_saved for k in module.state_dict().keys()) + ): + module.to("meta") + + def finalize(self): + """Saves remaining weights, renames files, and writes the index JSON.""" + # 1. Capture remaining weights not yet saved + full_sd = self.model.state_dict() + tie_word_embeddings = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) + all_saved_names = {p for meta in self.shard_meta for p in meta["params"]} + + finalize_skipped_meta_tensors = [] + for pname, tensor in full_sd.items(): + if pname in all_saved_names: + continue + if tensor.device.type == "meta": + continue + layer_name = ".".join(pname.split(".")[:-1]) + if self.lm_head_name is not None and layer_name == self.lm_head_name and tie_word_embeddings: + lm_head_module = get_module(self.model, self.lm_head_name) + lm_head_module.to("meta") # Must to meta, otherwise model's saver will dump it again + continue + self._add_tensor(pname, tensor.detach().to("cpu")) + + self._flush_shard() + + total_skipped = len(self.skipped_meta_tensors) + len(finalize_skipped_meta_tensors) + if total_skipped > 0: + examples = (self.skipped_meta_tensors + finalize_skipped_meta_tensors)[:5] + + # 2. Rename temp files to HF standard and map weights + if self.shard_counter == 0: + logger.warning("No tensors saved.") + return + + for idx, meta in enumerate(self.shard_meta, start=1): + old_path = os.path.join(self.output_dir, meta["tmp_file"]) + new_name = ( + f"model.{self.shard_suffix}" + if self.shard_counter == 1 + else f"model-{idx:05d}-of-{self.shard_counter:05d}.{self.shard_suffix}" + ) + + os.rename(old_path, os.path.join(self.output_dir, new_name)) + for p in meta["params"]: + self.global_weight_map[p] = new_name + + # 3. 
Write Index JSON + index_ext = "safetensors.index.json" if self.use_safetensors else "bin.index.json" + index_path = os.path.join(self.output_dir, f"model.{index_ext}") + + index_data = { + "metadata": { + "format": "safetensors" if self.use_safetensors else "pytorch", + "total_shards": self.shard_counter, + "total_parameters": int(self.total_param_elems), + "total_size": int(self.total_param_size_bytes), + }, + "weight_map": self.global_weight_map, + } + + if self.shard_counter > 1: + with open(index_path, "w", encoding="utf-8") as f: + json.dump(index_data, f, indent=2) + + logger.info(f"model has been saved to {self.output_dir}") + + +@torch.no_grad() +def shard_writer(rounder: object, m: torch.nn.Module = None, name: str = None, is_finalize: bool = False): + """Incrementally save module weights via a lazily created ShardWriter; set is_finalize=True to flush remaining tensors and write the index.""" + if m is None and name is None and not is_finalize: + raise ValueError("Must specify either name or m") + if not hasattr(rounder, "_shard_writer"): + rounder._shard_writer = ShardWriter(rounder) + + if m is None and name is not None: + m = get_module(rounder.model, name) + # Perform the save + if m is not None: + rounder._shard_writer.save_module(m, name) + + if is_finalize: + rounder._shard_writer.finalize() + # Optional: cleanup the saver object from rounder + del rounder._shard_writer diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py new file mode 100644 index 000000000..8afe83f72 --- /dev/null +++ b/auto_round/compressors_new/utils.py @@ -0,0 +1,1034 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import random +import re +import sys +from dataclasses import asdict, fields +from enum import Enum +from typing import TYPE_CHECKING, Callable, Union + +import torch +import transformers +from torch.amp import autocast + +from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType +from auto_round.logger import logger +from auto_round.utils import check_to_quantized, get_layer_names_in_block, get_module + +if TYPE_CHECKING: + from auto_round.schemes import QuantizationScheme + + +class BackendDataType(str, Enum): + STANDARD_FP = "fp" + MX_FP = "mx_fp" + NV_FP = "nv_fp" + + +def is_standard_fp(backend): + backend = backend.lower() + return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend) + + +def is_mx_fp(backend): + backend = backend.lower() + return BackendDataType.MX_FP in backend + + +def is_nv_fp(backend): + backend = backend.lower() + return BackendDataType.NV_FP in backend + + +def _is_weight_fp8_activation_static_fp8( + bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool +) -> bool: + return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic + + +def is_wfp8afp8(ar): + if ( + ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) + and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)) + and is_standard_fp(ar.act_data_type) + and is_standard_fp(ar.data_type) + ): + return True + else: + return False + + +def is_wint8aint8(ar): + if ("int8" in ar.act_data_type or ("int" in ar.act_data_type and ar.act_bits == 8)) and ( + "int8" in ar.data_type or ("int" in ar.data_type and ar.bits == 8) + ): + return True + else: + return False + + +def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool: + if isinstance(ar_or_format, str): + return "fp8_static" in ar_or_format.lower() + if ar_or_format.act_dynamic: + return False + if is_wfp8afp8(ar_or_format): + return True + return False + + +def is_dynamic_wint8aint8(ar_or_format: Union[str, Callable]) -> bool: + if isinstance(ar_or_format, str): + return "int8_w8a8" in ar_or_format.lower() + if not ar_or_format.act_dynamic: + return False + if is_wint8aint8(ar_or_format): + return True + return False + + +def block_forward( + block: torch.nn.Module, + input_ids: torch.Tensor, + input_others: dict, + amp: bool = False, + amp_dtype: torch.dtype = torch.float16, + device: torch.device = torch.device("cpu"), + output_return_id: int = 0, +) -> Union[torch.Tensor, dict]: + """Performs a forward pass through a block with the given inputs. + + Args: + block: The block to perform the forward pass on. + input_ids: The input IDs. + input_others: A dictionary containing other input data. + amp: A boolean indicating whether to use automatic mixed precision. + amp_dtype: The data type for automatic mixed precision. + device: The target device. + output_return_id: if the output has more than one tenor, return the specified idx tensor. + + Returns: + output: The output of the forward pass. 
+ """ + from auto_round.utils.model import to_device + + if input_ids.device != device: + input_ids = to_device(input_ids, device) + input_others = to_device(input_others, device) + input_tuple = input_others.pop("positional_inputs", None) + if "alibi" in input_others.keys() and input_others["alibi"] is not None: + alibi = input_others["alibi"] + input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3]) + if amp: + with autocast(device_type=str(device).split(":")[0], dtype=amp_dtype): # pragma: no cover + output = block(input_ids, *input_tuple, **input_others) + else: + output = block(input_ids, *input_tuple, **input_others) + if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)): + output = output[output_return_id] + return output + + +def check_skippable_keywords(key): + """ + Prints a reminder if a key is not stored during quantization fine-tuning. + """ + skippable_cache_keys = ("past_key_value",) + for cache_key in skippable_cache_keys: + if cache_key not in key: + return True + return False + + +def check_need_act_calibration( + is_act_dynamic: Union[bool, None], + act_data_type: Union[str, None] = None, + act_bits: Union[int, None] = 16, + static_kv_dtype: Union[str, None] = None, + static_attention_dtype: Union[str, None] = None, +) -> bool: + if static_kv_dtype is not None or static_attention_dtype is not None: + return True + if act_bits is None or act_bits > 8: + return False + # None is dynamic + if is_act_dynamic is not None and not is_act_dynamic: + return True + if act_data_type is not None and "static" in act_data_type: + return True + return False + + +def collect_best_params(block, cache_device="cpu"): + """Collect the best parameters from the block to the specified device.""" + params = {} + if hasattr(block, "orig_layer"): + for key in block.params.keys(): + params[key] = block.params[key].data.to(cache_device, copy=True) + else: + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + params[n] = {} + for key in m.params.keys(): + params[n][key] = m.params[key].data.to(cache_device, copy=True) + return params + + +def infer_bits_by_data_type(data_type: str): + """Infer bits by data_type + + Args: + data_type (str): data_type + + Returns: + int: bits inferred by data_type, None means cannot infer correct bits by data_type + """ + from auto_round.utils import SUPPORTED_DTYPES + + if data_type is None: + return 16 + for supported_dtype in SUPPORTED_DTYPES: + if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): + ##first check the following two bits + suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2] + if str.isdigit(suc_2str): + return int(suc_2str) + if str.isdigit(data_type[len(supported_dtype)]): + return int(data_type[len(supported_dtype)]) + return None + + +def set_layer_config( + model: torch.nn.Module, + layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], + default_scheme: Union[str, "QuantizationScheme"], + default_scale_dtype: torch.dtype | str, + supported_types: tuple, + inner_supported_types: tuple, + quant_block_list=None, + ignore_layers: str = "", + quant_lm_head: bool = False, + enable_gguf_official_mixed: bool = True, + is_mllm: bool = False, + fill_default_value=True, +) -> tuple[dict, bool, dict]: + """ + Normalize, validate, and expand layer-specific quantization configs. 
+ Returns (final_layer_config, has_quant_layer_outside_block) + """ + + from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme + from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module, is_separate_lm_head + + # ---- helpers ------------------------------------------------- + def dispatch_layer_config(layer_config: dict[str, dict]) -> None: + """Assign scheme values as attributes to matched modules.""" + for layer_name, scheme in layer_config.items(): + module = get_module(model, layer_name) + for attr, value in scheme.items(): + setattr(module, attr, value) + + def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: + """Convert config entry into dict and validate keys.""" + if isinstance(item, str): + config = asdict(preset_name_to_scheme(item.upper())) + elif isinstance(item, QuantizationScheme): + config = asdict(item) + elif isinstance(item, dict): + invalid = set(item) - set(scheme_keys + ("fixed_by_user", "scale_dtype")) + if invalid: + raise ValueError( + f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" + ) + config = dict(item) + else: + raise TypeError( + f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " + f"Expected str, dict, or QuantizationScheme." + ) + # Clean up + config = {k: v for k, v in config.items() if v is not None} + config["fixed_by_user"] = True + return config + + # ---- main logic ---------------------------------------------- + extra_scheme_keys = ("scale_dtype",) + scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) + layer_config = copy.deepcopy(layer_config) or {} + + # 1. ignore_layers -> force 16 + for name in get_fp_layer_names(model, ignore_layers): + layer_config[name] = { + "bits": 16, + "act_bits": 16, + "data_type": "float", + "act_data_type": "float", + "fixed_by_user": True, + } + + # 2. normalize + layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} + + # 3. infer missing bits + for cfg in layer_config.values(): + if "data_type" in cfg and "bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: + cfg["bits"] = b + if "act_data_type" in cfg and "act_bits" not in cfg: + if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: + cfg["act_bits"] = b + + # 4. fill defaults + if isinstance(default_scheme, str): + default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) + else: + default_dict = asdict(default_scheme) + default_dict["scale_dtype"] = default_scale_dtype + + # In AutoScheme with mixed gguf:q4_k_m, the super_group_size of gguf:q8_0 layer is None, + # which should not be filled by default q4km again + for cfg in layer_config.values(): + for key in scheme_keys: + if fill_default_value: + cfg.setdefault(key, copy.deepcopy(default_dict.get(key))) + else: + if key in extra_scheme_keys: + cfg.setdefault(key, copy.deepcopy(default_dict.get(key))) + else: + cfg.setdefault(key, None) + + # 5. 
collect supported modules + embedding_types = (torch.nn.Embedding,) + gguf_name = get_gguf_scheme(default_scheme) + if gguf_name: + if torch.nn.Embedding not in supported_types: + supported_types = (*supported_types, torch.nn.Embedding) + + # for some Embedding which type() is not torch.nn.Embedding + # for example: transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding + model_module_name = model.__class__.__module__ + module_cls = sys.modules[model_module_name] + for name in module_cls.__dict__: + if name.endswith("Embedding") and not name.endswith("RotaryEmbedding"): + embedding_types = (*embedding_types, getattr(module_cls, name)) + supported_types = (*supported_types, *embedding_types) + + all_supported_layer_names, embedding_layer_names = [], [] + all_module_names = [] + for n, m in model.named_modules(): + all_module_names.append(n) + # cleanup stale attributes + for key in scheme_keys: + if hasattr(m, key): + delattr(m, key) + if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: + continue + all_supported_layer_names.append(n) + if isinstance(m, embedding_types) or m.__class__.__name__.endswith("Embedding"): + embedding_layer_names.append(n) + + # 6. expand regex configs + regex_config = {} + for name in list(layer_config.keys()): + if name in all_supported_layer_names: + continue + if name in all_module_names: + m = get_module(model, name) + if len(list(m.children())) == 0 and type(m) not in supported_types: + layer_config.pop(name) + logger.warning(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") + continue + + regex = re.compile(name) + matched = [ln for ln in all_supported_layer_names if regex.search(ln)] + if not matched: + raise ValueError(f"Invalid '{name}' in layer_config, no match found.") + val = layer_config.pop(name) + regex_config[name] = val # keep regex config + for match in matched: + layer_config[match] = val + # regex_config = None if len(regex_config)==0 else regex_config + + # 7. lm_head + lm_head_name = get_lm_head_name(model) + tie_word_embeddings = False + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + if lm_head_name in layer_config: + quant_lm_head = True + + if quant_lm_head and tie_word_embeddings and not gguf_name: + quant_lm_head = False + logger.warning( + "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been supported currently" + ) + + if lm_head_name not in layer_config and quant_lm_head: + layer_config[lm_head_name] = copy.deepcopy(default_dict) + + if not quant_lm_head and not gguf_name: + layer_config.pop(lm_head_name, None) + + # 8. 
enforce shape divisibility for int weight-only + if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: + layer_config.setdefault(n, copy.deepcopy(default_dict)) + layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) + # logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + # enforce shape divisibility for mxfp/nvfp + if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[1] % default_dict["group_size"]: + layer_config.setdefault(n, copy.deepcopy(default_dict)) + layer_config[n].update( + {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True} + ) + logger.warning_once( + f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})." + ) + + # 9. block layers: mark as in_blocks=True + for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): + if name not in layer_config: + layer_config[name] = copy.deepcopy(default_dict) + layer_config[name]["fixed_by_user"] = False + layer_config[name]["in_blocks"] = True + + # ---- restore: ensure missing in_blocks are set to False and compute flag ---- + has_qlayer_outside_block = False + for cfg in layer_config.values(): + if "in_blocks" not in cfg: + cfg["in_blocks"] = False + # mark layer outside block + if not cfg["in_blocks"] and check_to_quantized(cfg): + has_qlayer_outside_block = True + + # 10. 
GGUF handling + if not gguf_name: + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block, regex_config + + # embed + lm_head defaults for gguf + tie_word_embeddings &= not is_separate_lm_head(model) + if lm_head_name not in layer_config and not tie_word_embeddings: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[lm_head_name] = cfg + has_qlayer_outside_block = True + for emd_name in embedding_layer_names: + if emd_name in layer_config: + continue + if not tie_word_embeddings: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] + else: + cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] + cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} + layer_config[emd_name] = cfg + + if enable_gguf_official_mixed: + model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT + layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) + + dispatch_layer_config(layer_config) + return layer_config, has_qlayer_outside_block, regex_config + + +def _use_more_bits(i_layer: int, n_layer: int): + return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2) + + +def _search_gguf_type(gguf_type): + if gguf_type in GGUF_INNER_CONFIG: + return gguf_type + pattern = re.compile("gguf:q([0-9]{1,})_[01k]") + bits = re.search(pattern, gguf_type) + if not bits: + raise KeyError(f"{gguf_type} is not a correct gguf type, please check") + + for suffix in ["_k", "_0", "_1"]: + if gguf_type.endswith(suffix): + continue + if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG: + return tmp_type + return None + + +def gguf_type_fallback(gguf_type: str) -> str: + gguf_type = gguf_type.lower() + if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q5_k": + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q6_k": + gguf_type = "gguf:q8_0" + return gguf_type + + +def get_gguf_qtype_by_layer_config(layer_config): + import gguf # pylint: disable=E0401 + + if layer_config["bits"] >= 16: + return None + bits = layer_config["bits"] + super_bits = layer_config.get("super_bits", None) + sym = layer_config["sym"] + group_size = layer_config.get("group_size", None) + super_group_size = layer_config.get("super_group_size", None) + if bits == 2 and super_bits == 4 and not sym and group_size == 16 and super_group_size == 16: + return gguf.GGMLQuantizationType.Q2_K + if bits == 3 and super_bits == 6 and sym and group_size == 16 and super_group_size == 16: + return gguf.GGMLQuantizationType.Q3_K + if bits == 4: + if super_bits is not None and super_bits == 6 and not sym and group_size == 32 and super_group_size == 8: + return gguf.GGMLQuantizationType.Q4_K + if super_bits is None and sym and group_size == 32: + return gguf.GGMLQuantizationType.Q4_0 + if super_bits is None and not sym and group_size == 32: + return gguf.GGMLQuantizationType.Q4_1 + if bits == 5: + if super_bits == 6 and not sym and group_size == 32 and super_group_size == 8: + return gguf.GGMLQuantizationType.Q5_K + if super_bits is None and sym and group_size == 32: + return gguf.GGMLQuantizationType.Q5_0 + if super_bits is None and not sym and group_size == 32: + return gguf.GGMLQuantizationType.Q5_1 + if bits == 6 and super_bits == 8 and group_size == 16 and super_group_size == 16: + return 
gguf.GGMLQuantizationType.Q6_K + if bits == 8 and sym and group_size == 32: + return gguf.GGMLQuantizationType.Q8_0 + raise ValueError("Unknown layer config") + + +def _get_digital_in_layer_name(layer_name): + pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") + res = re.search(pattern, layer_name) + if res: + return int(res[2]) + else: + return None + + +def _gguf_type_fallback(gguf_type: str) -> str: + gguf_type = gguf_type.lower() + if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q5_k": + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q6_k": + gguf_type = "gguf:q8_0" + return gguf_type + + +##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 +def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): + # # TODO: support for other format later + # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) + + import gguf # pylint: disable=E0401 + + from auto_round.schemes import get_gguf_scheme + from auto_round.utils.common import MM_KEYS, LazyImport + from auto_round.utils.model import get_lm_head_name, get_module + + # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture + convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf") + + try: + model_architecture = convert_hf_to_gguf.get_model_architecture( + hparams=model.config.to_dict(), model_type=model_type + ) + except AttributeError as e: + raise ImportError( + "Please use the latest gguf-py, you can use the following command to install it:\n" + "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py" + " && pip install . 
sentencepiece" + ) + try: + if model_type != ModelType.TEXT: + model_class_vision = convert_hf_to_gguf.ModelBase.from_model_architecture( + model_architecture, model_type=model_type + ) + model_class = convert_hf_to_gguf.ModelBase.from_model_architecture( + model_architecture, model_type=ModelType.TEXT + ) + + except NotImplementedError: + return layer_config, {} + + n_layer = None + if model_type != ModelType.TEXT: + n_layer_vision = None + for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]: + if hasattr(model.config, name): + n_layer = getattr(model.config, name) + if model_type != ModelType.TEXT: + if n_layer is not None and hasattr(model.config, "text_config"): + if hasattr(getattr(model.config, "text_config"), name): + n_layer = getattr(getattr(model.config, "text_config"), name) + for config_name in ["vision_config", "vision_encoder"]: + if hasattr(model.config, config_name): + if hasattr(getattr(model.config, config_name), name): + n_layer_vision = getattr(getattr(model.config, config_name), name) + break + if n_layer and n_layer_vision: + break + + if n_layer is None: + return layer_config, {} + + tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer) + if model_type != ModelType.TEXT: + tensor_map_vision = gguf.get_tensor_name_map(model_class_vision.model_arch, n_layer_vision) + + def _set_config(config, target_config): + for k, v in target_config.items(): + if isinstance(config, dict): + config[k] = v + else: + setattr(config, k, v) + return config + + gguf_format_config = {} + lm_head_name = get_lm_head_name(model) + inner_gguf_format = GGUF_CONFIG[target_gguf_format]["mostly"] + # ggml_type = getattr(gguf.GGMLQuantizationType,inner_gguf_format.split(":")[-1].upper()) + block_size = GGML_QUANT_SIZES[inner_gguf_format.split(":")[-1].lower()][0] + tie_word_embeddings = True + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + n_gqa = 1 + if ( + hasattr(model, "config") + and hasattr(model.config, "num_attention_heads") + and hasattr(model.config, "num_key_value_heads") + ): + n_gqa = model.config.num_attention_heads // model.config.num_key_value_heads + n_expert = 0 + for name in ["num_experts", "num_local_experts", "n_routed_experts"]: + if hasattr(model.config, name): + n_expert = getattr(model.config, name) + + i_attention_wv = 0 + i_ffn_down = 0 + layer_config_copy = copy.deepcopy(layer_config) + target_bits = None + if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit(): + target_bits = int(inner_gguf_format[6]) + + for layer_name, config in layer_config_copy.items(): + if not check_to_quantized(config): + continue + new_type = GGUF_CONFIG[target_gguf_format]["mostly"] + layer = get_module(model, layer_name) + if type(layer) == transformers.pytorch_utils.Conv1D: + input_features = layer.weight.shape[0] + else: + input_features = layer.weight.shape[-1] + i_layer = _get_digital_in_layer_name(layer_name) + + if lm_head_name is not None and layer_name == lm_head_name: + target_bits = int(re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["lm_head"]).group(1)) + if isinstance(layer, torch.nn.Embedding): + target_bits = int( + re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1) + ) + + if model_type != ModelType.TEXT and any([key in layer_name for key in MM_KEYS]): + gguf_name = tensor_map_vision.get_name(layer_name) + if gguf_name is 
None: + for key in MM_KEYS: + gguf_name = tensor_map_vision.get_name(layer_name.replace(f".{key}", "")) + if gguf_name is not None: + break + else: + gguf_name = tensor_map.get_name(layer_name) + if gguf_name is None: + gguf_name = tensor_map.get_name(layer_name.replace(".language_model", "")) + bits_index = 6 + if config.get("fixed_by_user", False): + if "bits" not in config: + logger.warning( + f"Setting layer_config requires providing bits, {layer_name} has not bits," + f" using bits={target_bits} instead." + ) + new_type = new_type[:bits_index] + target_bits + new_type[bits_index + 1 :] + else: + config_tmp = config.copy() + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in config.keys(): + if key not in scheme_keys: + config_tmp.pop(key, None) + matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched + if not matched_scheme: + if config.get("super_group_size", None) is not None or config.get("super_bits", None) is not None: + new_type = new_type[:bits_index] + str(config["bits"]) + "_k" + if new_type not in GGUF_INNER_CONFIG: + prefix_idx = 0 if config.get("sym", True) else 1 + new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}" + if new_type not in GGUF_INNER_CONFIG: + new_type = new_type[:bits_index] + str(config["bits"]) + f"_{1-prefix_idx}" + if new_type not in GGUF_INNER_CONFIG: + raise ValueError( + f"the setting in layer_config {layer_name} " + f"could not match any supported gguf format, please have a check." + ) + + new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] + new_type = _search_gguf_type(new_type) + if new_type is None: + raise ValueError(f"invalid bit setting for {layer_name}") + elif target_bits is not None and "bits" in config and config["bits"] != target_bits: + new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :] + new_type = _search_gguf_type(new_type) + if new_type is None: + raise ValueError(f"invalid bit setting for {layer_name}") + elif lm_head_name is not None and layer_name == lm_head_name and not tie_word_embeddings: + if gguf.MODEL_ARCH.FALCON == model_class.model_arch or input_features % block_size != 0: + new_type = "gguf:q8_0" + elif "lm_head" in GGUF_CONFIG[target_gguf_format]: + new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] + elif new_type != "gguf:q8_0": + new_type = "gguf:q6_k" + elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings: + # new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] + continue + elif isinstance(layer, torch.nn.Embedding): + if "embedding" in GGUF_CONFIG[target_gguf_format]: + new_type = GGUF_CONFIG[target_gguf_format]["embedding"] + elif gguf_name is None: + pass + # attn_v + elif "attn_v" in gguf_name: + if target_gguf_format == "gguf:q2_k": + new_type = "gguf:q4_k" if n_gqa >= 4 else "gguf:q3_k" + elif target_gguf_format == "gguf:q2_k_s" and n_gqa >= 4: + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_m": + new_type = "gguf:q5_k" if i_attention_wv < 2 else "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q5_k" + elif (target_gguf_format == "gguf:q4_k_m" or target_gguf_format == "gguf:q5_k_m") and _use_more_bits( + i_layer, n_layer + ): + new_type = "gguf:q6_k" + elif target_gguf_format == "gguf:q4_k_s" and i_attention_wv < 4: + new_type = "gguf:q5_k" + ##TODO check which models are be grouped into to LLM_TYPE_70B + # if (qs.model.type == LLM_TYPE_70B) { + # // In the 70B model we have 8 heads sharing 
the same attn_v weights. + # As a result, the attn_v.weight tensor is + # // 8x smaller compared to attn_q.weight.Hence, we can get a nice boost in quantization accuracy with + # // nearly negligible increase in model size by quantizing this tensor with more bits: + # if + # (new_type == GGML_TYPE_Q3_K | | new_type == GGML_TYPE_Q4_K) + # new_type = GGML_TYPE_Q5_K; + # } + if n_expert == 8: + new_type = "gguf:q8_k" + i_attention_wv += 1 + + elif "attn_k" in gguf_name: + if n_expert == 8: + new_type = "gguf:q8_0" + # ffn_down + elif "ffn_down" in gguf_name: + if target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q2_k_s": + if i_layer < n_layer / 8: + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_m": + if i_layer < n_layer / 16: + new_type = "gguf:q5_k" + elif gguf.MODEL_ARCH.FALCON == model_class.model_arch or _use_more_bits(i_layer, n_layer): + new_type = "gguf:q4_k" + else: + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_l": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + new_type = "gguf:q4_k" + else: + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q4_k_m": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + if i_layer < n_layer // 16: + new_type = "gguf:q6_k" + elif _use_more_bits(i_layer, n_layer): + new_type = "gguf:q5_k" + else: + new_type = "gguf:q4_k" + else: + if _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif target_gguf_format == "gguf:q5_k_m" and _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif ( + target_gguf_format == "gguf:q4_k_s" + and model_class.model_arch != gguf.MODEL_ARCH.FALCON + and i_layer < n_layer / 8 + ): + new_type = "gguf:q5_k" + elif (target_gguf_format == "gguf:q4_0" or target_gguf_format == "gguf:q5_0") and i_layer < n_layer / 8: + if target_gguf_format == "gguf:q4_0": + new_type = "gguf:q4_1" + else: + new_type = "gguf:q5_1" + i_ffn_down += 1 + + # attn_output + elif "attn_output" in gguf_name: + if gguf.MODEL_ARCH.FALCON != model_class.model_arch: + if n_expert == 8: + if target_gguf_format in ( + "gguf:q2_k", + "gguf:q3_k_s", + "gguf:q3_k_m", + "gguf:q4_k_s", + "gguf:q4_k_m", + "gguf:q5_k", + ): + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_m": + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q5_k" + else: + if target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q4_k" + # attn_qkv + elif "attn_qkv" in gguf_name: + if target_gguf_format in ("gguf:q3_k_m", "gguf:q3_k_l"): + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q4_k_m": + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q5_k_m": + new_type = "gguf:q5_k" + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = _gguf_type_fallback(new_type) + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = "gguf:bf16" + logger.warning( + f"fallback {layer_name} to {new_type}, " + f"because input_features({input_features}) % block_size({block_size}) != 0" + ) + # for deepseek v2 + if layer_name.endswith("kv_b_proj") and new_type.endswith("_k") and "Deepseek" in model.config.architectures[0]: + fallback = False + + # calc if need fallback + qk_nope_head_dim = model.config.qk_nope_head_dim + kv_b_shape = get_module(model, layer_name).weight.shape + + if ( + qk_nope_head_dim < QK_K + or 
qk_nope_head_dim % QK_K != 0 + or kv_b_shape[-1] < QK_K + or kv_b_shape[-1] % QK_K != 0 + ): + fallback = True + if fallback: + tmp_type = _gguf_type_fallback(new_type) + logger.warning_once( + f"self_attn.kv_b_proj does not support the use of {new_type}, replace it with {tmp_type}" + ) + new_type = tmp_type + + target_config = GGUF_INNER_CONFIG[new_type] + + _set_config(layer_config[layer_name], target_config) + _set_config(layer, target_config) + gguf_format_config[layer_name] = new_type + + return layer_config, gguf_format_config + + +def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): + """Identifies and returns layers in the model to exclude from quantization. + + This function processes a comma-separated list of fully precision (FP) layers, + matches them to the names of layers in the model, and returns a list of such + layers to exclude from quantization. + + Args: + model (torch.nn.Module): The model whose layers will be inspected. + ignore_layers (str): A comma-separated string of layer names to be excluded + from quantization. Whitespace is ignored in this string. + + Returns: + list: A list of layer names that match the specified FP layers or are + subcomponents of those layers. + """ + from auto_round.utils import SUPPORTED_LAYER_TYPES + + if not ignore_layers: + return [] + ignore_layers = ignore_layers.replace(" ", "").split(",") + all_layer_names = [] + for n, m in model.named_modules(): + if type(m) in SUPPORTED_LAYER_TYPES: + all_layer_names.append(n) + not_to_quantized_layers = [] + + for fp_layer in ignore_layers: + if fp_layer == "": + continue + if fp_layer in all_layer_names: + not_to_quantized_layers.append(fp_layer) + continue + if fp_layer[-1].isdigit(): + fp_layer = fp_layer + "." ##tricky setting + for name in all_layer_names: + if fp_layer in name: + not_to_quantized_layers.append(name) + logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}") + return not_to_quantized_layers + + +def get_shared_keys(model): + """ + Retrieves shared keys from the model's state dictionary. + + Args: + model (torch.nn.Module): The model to retrieve shared keys from. + + Returns: + tuple: tuple of shared keys. + """ + from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS + from auto_round.utils import SHARED_CACHE_KEYS + + shared_keys = SHARED_CACHE_KEYS + shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) + return shared_keys + + +def init_cache(positional_inputs, inputs): + """ + Initializes special model inputs by adding positional inputs if missing. + + Args: + positional_inputs (list): List of positional inputs to add to inputs. + inputs (dict): Dictionary of model inputs. + + Modifies: + inputs (dict): Adds "positional_inputs" key if not present. + """ + from auto_round.utils.model import to_device + + if "positional_inputs" not in inputs: # for chatglm Series + inputs["positional_inputs"] = [] + for idx, item in enumerate(positional_inputs): + inputs["positional_inputs"] = to_device(positional_inputs) + + +def reset_params(inputs): + """ + Resets specific input parameters to avoid saving the key-value cache during fine-tuning. + + Args: + inputs (dict): Dictionary of model inputs. + + Modifies: + inputs (dict): Sets "use_cache" to False if the key is present. + """ + if "use_cache" in inputs.keys(): # Not storing kv cache + inputs["use_cache"] = False + + +class IndexSampler: + """A cyclic sampler that returns shuffled index batches. 
+ + This sampler maintains internal state so that each call to `next_batch()` + continues from where it left off. When the remaining number of samples is + less than `batch_size`, the sampler reshuffles all indices and starts from + the beginning, discarding the last incomplete batch. + + Attributes: + nsamples (int): Total number of samples. + batch_size (int): Number of indices to return in each batch. + index (int): Current position in the index list. + indices (List[int]): Shuffled list of indices. + """ + + def __init__(self, nsamples: int, batch_size: int) -> None: + """Initializes the sampler. + + Args: + nsamples (int): Total number of samples (must be >= batch_size). + batch_size (int): Number of indices per batch. + + Raises: + ValueError: If batch_size is not in the range (0, nsamples]. + """ + if batch_size <= 0 or batch_size > nsamples: + raise ValueError("batch_size must be > 0 and <= nsamples") + + self.nsamples: int = nsamples + self.batch_size: int = batch_size + self.index: int = 0 + + self.indices: list[int] = list(range(nsamples)) + random.shuffle(self.indices) + + def next_batch(self) -> list[int]: + """Returns the next batch of shuffled indices. + + If the remaining indices are fewer than `batch_size`, the sampler + reshuffles the entire list and starts from the beginning. + + Returns: + list[int]: A list of size `batch_size` containing sample indices. + """ + if self.index + self.batch_size > self.nsamples: + random.shuffle(self.indices) + self.index = 0 + + batch = self.indices[self.index : self.index + self.batch_size] + self.index += self.batch_size + return batch + + +def _get_quantized_layer_names_outside_blocks(model, layer_config, supported_types, quant_block_list) -> list: + """Gets the names of quantized layers outside blocks in the model. + + Returns: + list: List of layer names outside blocks. + """ + if layer_config is None or len(layer_config) == 0: + return [] + + layer_names = [] + all_layers_in_block = get_layer_names_in_block(model, supported_types, quant_block_list) + + for key in layer_config.keys(): + if key in all_layers_in_block: + continue + layer = get_module(model, key) + if layer is None: + logger.error(f"could not find layer {key} in the model, exit...") + exit(-1) + if type(layer) in supported_types and check_to_quantized(layer_config[key]): + layer_names.append(key) + + return layer_names diff --git a/auto_round/context/__init__.py b/auto_round/context/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/context/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/context/base.py b/auto_round/context/base.py new file mode 100644 index 000000000..f427ce664 --- /dev/null +++ b/auto_round/context/base.py @@ -0,0 +1,26 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseContext: + __instance = None + + @classmethod + def get_context(cls): + return cls.__instance + + @classmethod + def create_context(cls, *args, **kwargs): + cls.__instance = cls(*args, **kwargs) + return cls.__instance diff --git a/auto_round/context/compress_context.py b/auto_round/context/compress_context.py new file mode 100644 index 000000000..090b83cef --- /dev/null +++ b/auto_round/context/compress_context.py @@ -0,0 +1,51 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.context.base import BaseContext +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) + +__all__ = ["CompressContext"] + + +class CompressContext(BaseContext): + def __init__( + self, + low_cpu_mem_usage: bool = True, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + ): + self.low_cpu_mem_usage = low_cpu_mem_usage + self.low_gpu_mem_usage = low_gpu_mem_usage + + if device_map is None: + device_map = 0 + self.device_map = device_map + if isinstance(self.device_map, str): + self.device_map = self.device_map.replace(" ", "") + self.device_list = parse_available_devices(self.device_map) + self.device = get_major_device(self.device_map) + + self.cache_device = torch.device("cpu") if low_gpu_mem_usage else self.device + + self.enable_torch_compile = enable_torch_compile diff --git a/auto_round/context/model_context.py b/auto_round/context/model_context.py new file mode 100644 index 000000000..ffa9b3a56 --- /dev/null +++ b/auto_round/context/model_context.py @@ -0,0 +1,227 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Callable, Optional, Union + +import torch +from packaging import version +from transformers import AutoConfig + +from auto_round import envs +from auto_round.compressors.utils import get_shared_keys +from auto_round.context.base import BaseContext +from auto_round.logger import logger +from auto_round.modeling.unfused_moe import apply_model_monkey_patches +from auto_round.special_model_handler import update_module +from auto_round.utils import ( + CpuInfo, + check_and_mark_quantized_module, + diffusion_load_model, + is_diffusion_model, + is_mllm_model, + is_moe_model, + is_moe_model_via_config, + llm_load_model, + mllm_load_model, + unsupported_meta_device, +) + +__all__ = ["ModelContext"] + + +class ModelContext(BaseContext): + _is_initialized = False + quantized = False + + act_quantize = False + + # model_related + _model_loaded = False + _init_model = False + is_mllm = False + is_diffusion = False + is_model_patched = False + is_moe_model = False + + hook_handles = [] + + def __init__( + self, + model, + tokenizer=None, + platform="hf", + model_dtype=None, + trust_remote_code=True, + amp=True, + need_calib=True, + device="cpu", + ): + self.model = model + self.tokenizer = tokenizer + self.device = device + + if envs.AR_USE_MODELSCOPE: + platform = "model_scope" + self._platform = platform + self._model_dtype = model_dtype + self._trust_remote_code = trust_remote_code + self._amp = amp + + self.need_calib = need_calib + + def _load_model(self): + if is_mllm_model(self.model, platform=self._platform): + self.is_mllm = True + if isinstance(self.model, str): + self.model, self.processor, self.tokenizer, self.image_processor = mllm_load_model( + self.model, platform=self._platform, device="cpu", model_dtype=self._model_dtype + ) + elif is_diffusion_model(self.model): + self.is_diffusion = True + self.pipe, self.model = diffusion_load_model( + self.model, platform=self._platform, device="cpu", model_dtype=self._model_dtype + ) + elif isinstance(self.model, str): + config: Optional[AutoConfig] = None + try: + config = AutoConfig.from_pretrained(self.model, trust_remote_code=self._trust_remote_code) + except (OSError, EnvironmentError) as e: + logger.debug( + "Failed to load config via AutoConfig.from_pretrained for %s: %s. " + "Proceeding without config-based checks.", + self.model, + e, + ) + + self.is_model_patched = apply_model_monkey_patches( + model_name=self.model, trust_remote_code=self._trust_remote_code + ) + import transformers + + if ( + not self.is_model_patched + and config is not None + and is_moe_model_via_config(config) + and version.parse(transformers.__version__) >= version.parse("5.0.0") + ): + from auto_round.modeling.fused_moe.replace_modules import BUILTIN_MODULES + + model_type = getattr(config, "model_type", None) + if model_type is not None and model_type not in BUILTIN_MODULES: + logger.warning( + "This MoE model has not been optimized by AutoRound yet, which may result in high RAM usage. " + "Please consider submitting an issue to https://github.com/intel/auto-round/issues" + ) + + self.model, self.tokenizer = llm_load_model( + self.model, + platform=self._platform, + device="cpu", # always load cpu first + model_dtype=self._model_dtype, + trust_remote_code=self._trust_remote_code, + ) + elif self.tokenizer is None and not self.is_diffusion and self.need_calib: + raise ValueError("A tokenizer must be set for non-str model input") + + self._model_loaded = True + + def _set_amp_dtype(self) -> None: + """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" + self._amp_dtype = torch.bfloat16 + if self.model.dtype != torch.float32: + self._amp_dtype = self.model.dtype + if self.device == "cpu" or "hpu" in self.device: + self._amp_dtype = torch.bfloat16 + if self._amp: + if self.device == "cpu" and not CpuInfo().bf16: + self._amp = False + self._amp_dtype = torch.float32 + self.model = self.model.to(torch.float32) + logger.warning( + f"amp is set to FALSE as the current {self.device} device does not support the 'bf16' data type." + ) + else: + if self.model.dtype != self._amp_dtype: + self.model = self.model.to(self._amp_dtype) + else: + self._amp_dtype = torch.float32 + self.model = self.model.to(torch.float32) + + def initialize(self, formats): + # load and handle model + if not self._model_loaded: + self._load_model() + + if unsupported_meta_device(self.model): + raise RuntimeError( + "AutoRound does not support parameters on meta device. " + "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU." + ) + check_and_mark_quantized_module(self.model) + self.model = self.model.eval() + self.shared_cache_keys = get_shared_keys(self.model) + + # Important Note! This is not very robust, do NOT rely on it to do high-risk things + self.is_moe_model = is_moe_model(self.model) + + self._set_amp_dtype() + if self.act_quantize and self._amp_dtype == torch.float16: + logger.warning("force to use bf16 for quantization tuning when enabling activation quantization") + self._amp_dtype = torch.bfloat16 + if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged + self.model = self.model.to(torch.bfloat16) + else: + logger.info(f"using {self.model.dtype} for quantization tuning") + + # It is best to modify the model structure in the quantize function and check the format, + # because it may cause the gguf format to not be exported normally. + self.model = update_module( + self.model, formats=formats, trust_remote_code=self._trust_remote_code, cleanup_original=False + ) + + # Temporary names must be assigned after handle_moe_model; + # placing them earlier would cause them to be removed when the module is replaced. 
+ for n, m in self.model.named_modules(): + m.global_name = n + + if self._amp and self.model.dtype != self._amp_dtype: + self.model = self.model.to(self._amp_dtype) + + self._init_model = True + + self._is_initialized = True + + def replace_forward(self, register_hook): + """Replaces the forward function. + register_hook(layer_name, module, hook_handles) + """ + assert self._init_model, "should load and initialize model first" + hook_handles = [] + + for n, m in self.model.named_modules(): + register_hook(n, m, hook_handles) + + self.hook_handles = hook_handles + + def recover_forward(self): + """Recovers the forward function.""" + assert self._init_model, "should load and initialize model first" + + for n, m in self.model.named_modules(): + if hasattr(m, "orig_forward"): + m.forward = m.orig_forward + delattr(m, "orig_forward") + for hook_handle in self.hook_handles: + hook_handle.remove() + self.hook_handles = [] diff --git a/auto_round/formats.py b/auto_round/formats.py index e6fc7f56f..9496165c8 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -41,7 +41,9 @@ get_gguf_scheme, ) from auto_round.utils import ( + INNER_SUPPORTED_LAYER_TYPES, SUPPORTED_FORMATS, + SUPPORTED_LAYER_TYPES, check_to_quantized, copy_python_files_from_model_cache, find_matching_blocks, @@ -143,7 +145,7 @@ def _check_divisible_by_32(ar): default_dict = asdict(ar.scheme) if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16: for n, m in ar.model.named_modules(): - if type(m) in ar.supported_types or m.__class__.__name__ in ar.inner_supported_types: + if type(m) in SUPPORTED_LAYER_TYPES or m.__class__.__name__ in INNER_SUPPORTED_LAYER_TYPES: if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: if ar.layer_config is None: ar.layer_config = {} diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 9ac3a4a9d..9b70a5a86 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -13,15 +13,20 @@ # limitations under the License. import copy from copy import deepcopy -from dataclasses import dataclass, fields -from typing import Optional, Union +from dataclasses import asdict, dataclass, fields +from typing import TYPE_CHECKING, Any, Optional, Union import torch +from auto_round.compressors.utils import infer_bits_by_data_type from auto_round.logger import logger +from auto_round.utils import SUPPORTED_DTYPES __all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme"] +if TYPE_CHECKING: + from auto_round.auto_scheme.gen_auto_scheme import AutoScheme + @dataclass class QuantizationScheme: @@ -106,6 +111,147 @@ def is_preset_scheme(name: str) -> bool: return name.upper() in PRESET_SCHEMES +def _reconcile_bits_and_dtype(config: dict, prefix: str = ""): + """ + Harmonizes 'bits' and 'data_type' for weights or activations. + Ensures internal consistency by prioritizing data_type inference. + """ + + dt_key = f"{prefix}data_type" + bits_key = f"{prefix}bits" + + if config.get(dt_key) is None: + return + + # Infer the correct bit-width based on the data_type string + inferred_bits = infer_bits_by_data_type(config[dt_key]) + + if inferred_bits is not None and inferred_bits < 16: + # Check for conflict between user-specified bits and inferred bits + if inferred_bits != config.get(bits_key): + logger.warning(f"'{dt_key}' does not match '{bits_key}'. 
" f"Resetting '{bits_key}' to {inferred_bits}.") + config[bits_key] = inferred_bits + + # Normalize data_type (e.g., 'mx_fp4' -> 'mx') + for supported in SUPPORTED_DTYPES: + if config[dt_key] == f"{supported}{inferred_bits}": + config[dt_key] = supported + break + + +def _override_scheme_with_user_specify( + scheme: Union[str, dict, QuantizationScheme], user_scheme_overrides: dict[str, Any], return_str=True +) -> Union[str, QuantizationScheme]: + """ + Updates a base quantization scheme with user-provided overrides. + Handles GGUF formatting and synchronizes weight/activation parameters. + """ + # 1. GGUF special handling: map data_type suffix to GGUF scheme names + dt_override = user_scheme_overrides.get("data_type", "") + if ( + isinstance(scheme, QuantizationScheme) or (isinstance(scheme, str) and not scheme.startswith("gguf")) + ) and dt_override.endswith("_dq"): + if "bits" not in user_scheme_overrides: + raise KeyError(f"Must specify 'bits' when using data_type={dt_override}") + + bits = user_scheme_overrides["bits"] + suffix = "k" if bits == 6 else "k_s" + scheme = f"gguf:q{bits}_{suffix}" + + # 2. Convert input scheme to a dictionary for processing + if isinstance(scheme, QuantizationScheme): + scheme_dict = asdict(scheme) + elif isinstance(scheme, str): + normalized_name = scheme.strip("'\" ").upper() + if normalized_name.startswith("GGUF") and len(user_scheme_overrides) > 0: + logger.warning_once( + "When using GGUF scheme, user-specified overrides will be ignored to ensure format compatibility." + ) + user_scheme_overrides = {} + # If no overrides exist, return the normalized string immediately + if not user_scheme_overrides and return_str: + return normalized_name + scheme_dict = asdict(preset_name_to_scheme(normalized_name)) + else: + scheme_dict = scheme.copy() + + # 3. Apply overrides and define default behaviors + scheme_dict.update(user_scheme_overrides) + + if scheme_dict.get("act_dynamic") is None: + scheme_dict["act_dynamic"] = True + + # 4. Reconcile weight settings (bits vs data_type) + _reconcile_bits_and_dtype(scheme_dict) + + # 5. Fallback logic: Inherit activation settings from weight settings + scheme_dict["act_group_size"] = ( + scheme_dict.get("act_group_size") + if scheme_dict.get("act_group_size") is not None + else scheme_dict.get("group_size") + ) + scheme_dict["act_bits"] = scheme_dict.get("act_bits") or 16 + scheme_dict["act_sym"] = ( + scheme_dict.get("act_sym") if scheme_dict.get("act_sym") is not None else scheme_dict.get("sym") + ) + + # 6. Activation data_type logic + if scheme_dict.get("act_data_type") is None: + is_supported = scheme_dict["data_type"] in SUPPORTED_DTYPES + if is_supported and scheme_dict["act_bits"] < 16: + scheme_dict["act_data_type"] = scheme_dict["data_type"] + logger.info(f"Activation adopting weight data_type: {scheme_dict['data_type']}") + else: + scheme_dict["act_data_type"] = "float" + + # 7. Reconcile activation settings + _reconcile_bits_and_dtype(scheme_dict, prefix="act_") + + return QuantizationScheme.from_dict(scheme_dict) + + +def _parse_scheme( + scheme: Union[str, dict, QuantizationScheme, "AutoScheme"], user_scheme_overrides: dict[str, Any] +) -> tuple[Union[str, QuantizationScheme], bool]: + """ + Parses the final scheme. 
+ """ + from auto_round.auto_scheme.gen_auto_scheme import AutoScheme + + is_auto_scheme = isinstance(scheme, AutoScheme) + if is_auto_scheme: + if not scheme.options: + raise ValueError("AutoScheme options cannot be empty") + else: + for option in scheme.options: + if isinstance(option, str): + if "mixed" in option: + raise ValueError(f"Mixed option {option} is not supported") + + # Map user overrides across all auto-scheme options + scheme.options = [_override_scheme_with_user_specify(opt, user_scheme_overrides) for opt in scheme.options] + + # Select the primary scheme for attribute binding (skipping BF16) + default_scheme = scheme.options[0] + for opt in scheme.options: + if opt == "BF16": + continue + if isinstance(opt, QuantizationScheme): + if opt.bits < 16 or (opt.act_bits and opt.act_bits < 16): + default_scheme = opt + break + else: + default_scheme = _override_scheme_with_user_specify(scheme, user_scheme_overrides) + + # Extract attributes from the chosen default_scheme + if isinstance(default_scheme, str): + final_attrs = _override_scheme_with_user_specify(default_scheme, user_scheme_overrides, return_str=False) + final_attrs = asdict(final_attrs) + else: + final_attrs = asdict(default_scheme) + return default_scheme, is_auto_scheme, final_attrs + + W4A16 = QuantizationScheme.from_dict( { "bits": 4, diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 91f2b7e89..7f2d7a9b2 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -17,7 +17,7 @@ import re from collections import UserDict from pathlib import Path -from typing import Union +from typing import TYPE_CHECKING, Union import psutil import torch @@ -27,7 +27,6 @@ from auto_round import envs from auto_round.export.export_to_gguf.config import ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme from auto_round.utils.weight_handler import ( _dequant_fp8_linear_weight, check_and_mark_quantized_module, @@ -35,6 +34,9 @@ is_quantized_input_module, ) +if TYPE_CHECKING: + from auto_round.schemes import QuantizationScheme + def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: """This function is recommended to be used instead of module.weight = None. 
From ca1709748d779ae4e2517fe7d3255be11bac2fe7 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 16 Mar 2026 09:09:21 +0800 Subject: [PATCH 02/90] update Signed-off-by: n1ck-guo --- .../algorithms/quantization/__init__.py | 6 + .../quantization/auto_round/adam.py | 62 ++ .../quantization/auto_round/config.py | 74 +- .../quantization/auto_round/quantize.py | 396 -------- .../quantization/auto_round/quantizer.py | 952 ++++++++++++++++++ auto_round/algorithms/quantization/base.py | 293 +++++- auto_round/algorithms/quantization/config.py | 181 ++++ auto_round/compressors_new/base.py | 633 ++++-------- auto_round/compressors_new/shard_writer.py | 20 +- auto_round/context/base.py | 39 +- .../{compress_context.py => compress.py} | 3 +- .../context/{model_context.py => model.py} | 58 +- auto_round/schemes.py | 4 +- auto_round/utils/model.py | 1 + 14 files changed, 1836 insertions(+), 886 deletions(-) create mode 100644 auto_round/algorithms/quantization/auto_round/adam.py delete mode 100644 auto_round/algorithms/quantization/auto_round/quantize.py create mode 100644 auto_round/algorithms/quantization/auto_round/quantizer.py create mode 100644 auto_round/algorithms/quantization/config.py rename auto_round/context/{compress_context.py => compress.py} (95%) rename auto_round/context/{model_context.py => model.py} (82%) diff --git a/auto_round/algorithms/quantization/__init__.py b/auto_round/algorithms/quantization/__init__.py index 14a492441..a4b2fca7f 100644 --- a/auto_round/algorithms/quantization/__init__.py +++ b/auto_round/algorithms/quantization/__init__.py @@ -11,3 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from auto_round.algorithms.quantization.base import BaseQuantizers +from auto_round.algorithms.quantization.config import QuantizationConfig +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer +from auto_round.algorithms.quantization.auto_round.adam import ARAdamQuantizer diff --git a/auto_round/algorithms/quantization/auto_round/adam.py b/auto_round/algorithms/quantization/auto_round/adam.py new file mode 100644 index 000000000..b05af35ea --- /dev/null +++ b/auto_round/algorithms/quantization/auto_round/adam.py @@ -0,0 +1,62 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
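The `adam.py` code that follows resolves the optimizer by name on `torch.optim` and, when AMP is active off-CPU, drives training through a CUDA `GradScaler`. The snippet below is a rough, self-contained sketch of that pattern, not auto_round API: `resolve_optimizer` is a hypothetical helper, and the step ordering is simplified relative to the `_scale_loss_and_backward`/`_step` methods further down.

```python
# Hypothetical sketch of the optimizer-by-name + GradScaler pattern; not auto_round API.
import torch


def resolve_optimizer(optimizer=None):
    # None -> AdamW, a string -> looked up on torch.optim, anything else passes through.
    if optimizer is None:
        return torch.optim.AdamW
    if isinstance(optimizer, str):
        return getattr(torch.optim, optimizer)
    return optimizer


params = [torch.nn.Parameter(torch.zeros(4))]
optimizer = resolve_optimizer("AdamW")(params, lr=1e-3)

# With AMP on CUDA, the loss is scaled before backward and the scaler drives the step.
scaler = (
    torch.cuda.amp.GradScaler(init_scale=1024, growth_interval=100000)
    if torch.cuda.is_available()
    else None
)
loss = (params[0] ** 2).sum()
if scaler is not None:
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
else:
    loss.backward()
    optimizer.step()
optimizer.zero_grad()
```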
+from typing import Union + +import torch + +from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer +from auto_round.compressors.base import BaseCompressor +from auto_round.schemes import QuantizationScheme +from auto_round.utils import check_is_cpu, htcore, is_hpex_available + + +class ARAdamQuantizer(ARQuantizer): + def _get_optimizer(self, optimizer): + if optimizer is None: + optimizer = torch.optim.AdamW + elif isinstance(optimizer, str): + optimizer = getattr(torch.optim, optimizer) + else: + optimizer = optimizer + return optimizer + + def _get_scaler(self): + scaler = None + if self.amp and not check_is_cpu(self.device): + from torch.cuda.amp import GradScaler + + scaler = GradScaler(init_scale=1024, growth_interval=100000) + return scaler + + def _scale_loss_and_backward(self, scaler, loss): + if scaler is not None: + loss = scaler.scale(loss) + + loss.backward() + if is_hpex_available(): + htcore.mark_step() + return loss + + def _step(self, scaler, optimizer, lr_schedule): + if scaler is not None: + scaler.step(optimizer) + optimizer.zero_grad() + lr_schedule.step() + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + lr_schedule.step() + if is_hpex_available(): + htcore.mark_step() diff --git a/auto_round/algorithms/quantization/auto_round/config.py b/auto_round/algorithms/quantization/auto_round/config.py index 8272ea156..cb8e1b23f 100644 --- a/auto_round/algorithms/quantization/auto_round/config.py +++ b/auto_round/algorithms/quantization/auto_round/config.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.algorithms.alg_config import AlgConfig +from typing import Union + +from auto_round.algorithms.quantization.config import QuantizationConfig +from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme -class AutoRoundConfig(AlgConfig): +class AutoRoundConfig(QuantizationConfig): """ Args: @@ -28,16 +32,33 @@ class AutoRoundConfig(AlgConfig): enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning """ + _alg_cls = "ARQuantizer" + def __init__( self, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + *, iters: int = 200, lr: float = None, minmax_lr: float = None, lr_scheduler=None, + seqlen: int = 2048, + nsamples: int = 128, + momentum: float = 0.0, batch_size: int = 8, enable_minmax_tuning: bool = True, enable_norm_bias_tuning: bool = False, + gradient_accumulate_steps: int = 1, + enable_alg_ext: bool = False, + not_use_best_mse: bool = False, + dynamic_max_gap: int = -1, + enable_quanted_input: bool = True, + optimizer: str = None, + enable_adam: bool = False, + **kwargs, ): + super().__init__(scheme=scheme, layer_config=layer_config, **kwargs) self.iters = iters if self.iters < 0: logger.warning("`iters` must be non-negative, reset it to 200") @@ -55,7 +76,11 @@ def __init__( self.minmax_lr = minmax_lr or self.lr self.lr_scheduler = lr_scheduler - self.batch_size = batch_size + self.seqlen = seqlen + self.nsamples = nsamples + self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps + self.momentum = momentum + self.enable_alg_ext = enable_alg_ext # Some helpers self.infer_bs_coeff = 1 @@ -65,3 +90,46 @@ def __init__( 
self.enable_norm_bias_tuning = enable_norm_bias_tuning if self.enable_norm_bias_tuning: logger.warning("the `enable_norm_bias_tuning` feature is experimental and currently has limited support.") + self.not_use_best_mse = not_use_best_mse + self.dynamic_max_gap = dynamic_max_gap + self.enable_quanted_input = enable_quanted_input + self.optimizer = optimizer + self.enable_adam = enable_adam + + if self.enable_adam: + self._alg_cls = "ARAdamQuantizer" + + def check_configs(self) -> None: + """Checks if the configurations are valid. + + Raises: + ValueError, TypeError: If any of the configurations are invalid. + """ + super().check_config() + + if self.batch_size <= 0: + raise ValueError("`batch_size` must be positive") + if self.iters < 0: + raise ValueError("`iters` must be non-negative") + if self.seqlen <= 0: + raise ValueError("`seqlen` must be positive") + if self.nblocks <= 0: + raise ValueError("`nblocks` must be positive") + if self.gradient_accumulate_steps <= 0: + raise ValueError("`gradient_accumulate_steps` must be positive") + + if self.nsamples < self.gradient_accumulate_steps * self.batch_size: + if self.batch_size > self.nsamples: + if self.iters > 0: # GGUF should log this warning, but we don't know the format here + logger.warning( + f"reset `batch_size` to {self.nsamples} as `nsamples`({self.nsamples})" + f" is smaller than batch_size({self.batch_size})" + ) + self.batch_size = self.nsamples + if self.gradient_accumulate_steps > self.nsamples // self.batch_size: + self.gradient_accumulate_steps = self.nsamples // self.batch_size + logger.warning( + f"reset `gradient_accumulate_steps` to {self.gradient_accumulate_steps}" + f" as nsamples must equal or greater" + f" than gradient_accumulate_steps * batch_size" + ) diff --git a/auto_round/algorithms/quantization/auto_round/quantize.py b/auto_round/algorithms/quantization/auto_round/quantize.py deleted file mode 100644 index 1941db2d0..000000000 --- a/auto_round/algorithms/quantization/auto_round/quantize.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright (c) 2026 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
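As a quick worked example of the `check_configs` reconciliation just above: when the calibration set cannot cover `gradient_accumulate_steps * batch_size` samples, the batch size is first capped at `nsamples` and the accumulation steps are then reduced. A minimal, simplified sketch follows (the real method also validates `iters`, `seqlen`, and `nblocks`, and logs warnings rather than returning values):

```python
# Simplified sketch of AutoRoundConfig.check_configs' sample/batch reconciliation.
def reconcile(nsamples: int, batch_size: int, gradient_accumulate_steps: int):
    if nsamples < gradient_accumulate_steps * batch_size:
        if batch_size > nsamples:
            # Not enough calibration samples for even one batch: shrink the batch.
            batch_size = nsamples
        if gradient_accumulate_steps > nsamples // batch_size:
            # Keep nsamples >= gradient_accumulate_steps * batch_size.
            gradient_accumulate_steps = nsamples // batch_size
    return batch_size, gradient_accumulate_steps


# 128 samples cannot feed 8 accumulation steps of batch 32, so steps drop to 4.
print(reconcile(nsamples=128, batch_size=32, gradient_accumulate_steps=8))  # (32, 4)
```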
-import copy -from typing import Any, Callable, Optional, Union - -import accelerate -import torch - -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from auto_round.compressors_new.utils import ( - IndexSampler, - block_forward, - check_need_act_calibration, - check_skippable_keywords, - collect_best_params, - get_shared_keys, - infer_bits_by_data_type, - init_cache, - is_nv_fp, - reset_params, - set_layer_config, -) -from auto_round.context.compress_context import CompressContext -from auto_round.context.model_context import ModelContext -from auto_round.logger import logger -from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ -from auto_round.utils import ( - clear_memory, - convert_module_to_hp_if_necessary, - is_auto_device_mapping, - memory_monitor, - mv_module_from_gpu, - set_amax_for_all_moe_layers, - to_device, -) -from auto_round.utils.device import ( - clear_memory_if_reached_threshold, - get_major_device, - parse_available_devices, - set_auto_device_map_for_block_with_tuning, - set_non_auto_device_map, -) -from auto_round.utils.distributed import setup_ddp_if_needed_ -from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block - - -class ARQuantizer: - - def __init__(self, config: AutoRoundConfig): - self.config = AutoRoundConfig - - def quantize_block( - self, - block: torch.nn.Module, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", - auto_offload=True, - ): - """Quantize the weights of a given block of the model. - - Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. 
- - Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) - """ - model_context = ModelContext.get_context() - compress_context = CompressContext.get_context() - - materialize_model_(block) - convert_module_to_hp_if_necessary(block, model_context._amp_dtype, device) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(compress_context.device_map) and len(compress_context.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, - compress_context.device_map, - input_ids, - compress_context.low_gpu_mem_usage, - self.batch_size, - device, - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(compress_context.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - compress_context.cache_device, - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - compress_context.cache_device, - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input if q_input is not None else input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - compress_context.cache_device, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=compress_context.device_list) - else: - clear_memory(device_list=compress_context.device_list) - input_ids = q_input - - quantized_layer_names, unquantized_layer_names = self.wrapper_block( - block, - self.enable_minmax_tuning, - self.enable_norm_bias_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ) - # Call this before quantization and after applying the block wrapper. - if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse. 
- from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) - round_params = [] - minmax_params = [] - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - for key in m.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(m.params[key]) - else: - round_params.append(m.params[key]) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - - extra_kwargs = {} if is_adam else {"momentum": self.momentum} - - if self.enable_minmax_tuning: - params = [ - {"params": round_params}, - {"params": minmax_params, "lr": minmax_lr}, - ] - else: - params = round_params - - optimizer = self.optimizer( - params, - lr=lr, - weight_decay=0, - **extra_kwargs, - ) - - if len(round_params) + len(minmax_params) <= 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block" - ) - logger.info(dump_info) - unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - - if isinstance(input_ids, dict): # input_ids of Flux is dict - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - num_elm = 1 - mse_reduction = "mean" - if self.gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - best_params = {} - total_loss = 0 - global_batch_size = self.batch_size * self.gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - # We assume the block input and output shape is same - if self.gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - num_elm = self._get_current_num_elm(input_ids, whole_indices) - setup_ddp_if_needed_(self, block, self.device_list) - index_sampler = IndexSampler(nsamples, global_batch_size) - batch_size = self.batch_size - for i in range(self.iters): - if self.enable_alg_ext and self.data_type.endswith("dq"): - for n, m in block.named_modules(): - m.cur_iter = i - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) - current_output = to_device(current_output, loss_device) - output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) - loss = self._get_loss(output_q, current_output, indices, mse_loss, device) - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.5, device_list=compress_context.device_list) - - self._scale_loss_and_backward(scaler, loss) - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear 
memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.8, device_list=compress_context.device_list) - - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(block, compress_context.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(block, compress_context.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - if self.iters > 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - ) - else: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - "layers in the block" - ) - - if self.low_gpu_mem_usage: - clear_memory(device_list=compress_context.device_list) # clear cached memory during training - if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") - with torch.no_grad(): - unwrapper_block(block, best_params) - - if is_nv_fp(self.act_data_type): - # enable moe experts act_max automatic generation for WrapperWALayer - set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - cache_device=compress_context.cache_device, - ) - - if len(compress_context.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=compress_context.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(compress_context.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=compress_context.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output - - @torch.no_grad() - def _get_block_outputs( - self, - block: torch.nn.Module, - input_ids: torch.Tensor | list[torch.Tensor], - input_others: torch.Tensor | dict, - bs: int, - device: Union[str, torch.device], - cache_device: Union[str, torch.device], - save_output: bool = True, - ): - """Compute the output of a given block of the model for a given input. - - Args: - block: The block of the model. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - bs: The batch size for computing the output. - device: The device for computation. - cache_device: The device for storing the output. - batch_dim: The batch dimension of the output tensor. - - Returns: - The output tensor of the block. 
- """ - output = [] - nsamples = len(input_ids) - for i in range(0, nsamples, bs): - end_index = min(nsamples, i + bs) - indices = torch.arange(i, end_index).to(torch.long) - tmp_input_ids, tmp_input_others = self._sampling_inputs( - input_ids, input_others, indices, self.seqlen, self.batch_dim, share_cache_keys=self.shared_cache_keys - ) - tmp_output = self.block_forward( - block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device - ).to(cache_device) - if save_output: - if self.batch_size == 1: - output.append(tmp_output) - else: - output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) - if self.low_gpu_mem_usage: - clear_memory(device_list=self.device_list) - - return output diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py new file mode 100644 index 000000000..423bb4f29 --- /dev/null +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -0,0 +1,952 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +from collections import defaultdict +from contextlib import nullcontext +from functools import partial +from typing import Any, Callable, Optional, Union + +import accelerate +import torch +from torch import autocast + +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.base import BaseQuantizers +from auto_round.compressors_new.utils import ( + IndexSampler, + block_forward, + check_need_act_calibration, + check_skippable_keywords, + collect_best_params, + get_shared_keys, + infer_bits_by_data_type, + init_cache, + is_nv_fp, + reset_params, + set_layer_config, +) +from auto_round.logger import logger +from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.sign_sgd import SignSGD +from auto_round.utils import ( + clear_memory, + compile_func, + convert_module_to_hp_if_necessary, + get_module, + htcore, + is_auto_device_mapping, + is_hpex_available, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + to_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.utils.distributed import setup_ddp_if_needed_ +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + + +class ARQuantizer(BaseQuantizers): + + def __init__(self, config: AutoRoundConfig): + super().__init__(config) + self.attention_mask = [] + + self.iters = config.iters + self.lr = config.lr + self.minmax_lr = config.minmax_lr + self.lr_scheduler = config.lr_scheduler + self.seqlen = config.seqlen + self.nsamples = config.nsamples + self.batch_size = config.batch_size + self.batch_dim = config.batch_dim + self.momentum = config.momentum + self.infer_bs_coeff = config.infer_bs_coeff + 
self.enable_minmax_tuning = config.enable_minmax_tuning + self.enable_norm_bias_tuning = config.enable_norm_bias_tuning + self.gradient_accumulate_steps = config.gradient_accumulate_steps + self.enable_alg_ext = config.enable_alg_ext + self.not_use_best_mse = config.not_use_best_mse + self.enable_quanted_input = config.enable_quanted_input + self.dynamic_max_gap = config.dynamic_max_gap + + self.optimizer = self._get_optimizer(optimizer=config.optimizer) + self.wrapper_block = wrapper_block + + def post_init(self): + super().post_init() + if self.enable_alg_ext: + try: + logger.warning_once("using algorithm extension for quantization.") + from auto_round.alg_ext import wrapper_autoround + + wrapper_autoround(self.quantizer) + except (ImportError, ModuleNotFoundError): + logger.error("algorithm extension import error, fallback to default mode") + + @torch.no_grad() + def _get_block_forward_func(self, name: str) -> Callable: + """Gets the forward function. + + Args: + name (str): The name of the function. + Returns: + function: The forward function. + """ + + def post_process_cache_data(batch_size, data, data_name): + """ + Processes store data for batch handling, reshaping if necessary. + + Args: + batch_size (int): The size of the batch. + data: The data value to store, potentially for caching. + data_name (str): Name of the data. + + Returns: + Processed data or None + """ + new_data = data + if batch_size <= 1: + return new_data + if data_name in self.model_context.shared_cache_keys: + return None + if "alibi" in data_name: + if isinstance(data, torch.Tensor): + alibi = data + alibi = alibi.reshape(batch_size, -1, alibi.shape[1], alibi.shape[2]) + new_data = alibi + return new_data + + def forward(m, hidden_states=None, *positional_inputs, **kwargs): + """Rewrite forward function, process and collect input data. + + Args: + hidden_states (torch.Tensor): The hidden states tensor. + *positional_inputs: Variable number of positional arguments. + **kwargs: Variable number of keyword arguments. + + Returns: + NotImplementedError: Getting the first layer inputs and then raise the error to save runtime. + """ + if name not in self.inputs: + self.inputs[name] = {} + init_cache(positional_inputs, self.inputs[name]) + + if self.batch_dim is None: + self.batch_dim = 0 + if hidden_states is not None and self.batch_size > 1: + if hidden_states.shape[0] > self.batch_size: + self.batch_dim = 1 + if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.batch_size: + logger.error( + "this model has not been supported, " + "please raise an issue in https://github.com/intel/auto-round/issues" + " or try to set the `batch_size` to 1 and " + "`gradient_accumulate_steps` to your current batch size." 
+ ) + exit(-1) + + if hidden_states is not None: + kwargs["hidden_states"] = hidden_states + + for key in kwargs.keys(): + if ( + isinstance(kwargs[key], torch.Tensor) + or isinstance(kwargs[key], list) + or isinstance(kwargs[key], tuple) + ): + if key not in self.inputs[name].keys(): # initialization + data = to_device(kwargs[key], device=torch.device("cpu")) + if data is None or (self.batch_size > 1 and key in self.model_context.shared_cache_keys): + self.inputs[name][key] = data + continue + if self.batch_size <= 1: + self.inputs[name][key] = [data] + else: + data = post_process_cache_data(self.batch_size, data, key) + self.inputs[name][key] = list(torch.split(data, 1, dim=self.batch_dim)) + else: # append cache inputs + new_data = post_process_cache_data(self.batch_size, kwargs[key], key) + if new_data is None: # shareable args or NoneType + continue + new_data = to_device(new_data, device=torch.device("cpu")) + if self.batch_size <= 1: + self.inputs[name][key].append(new_data) + else: + self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.batch_dim))) + elif isinstance(kwargs[key], (str, bool, type(None))): + if key not in self.inputs[name].keys(): + self.inputs[name][key] = kwargs[key] + else: + # Parameters not to be cached + if check_skippable_keywords(key): + logger.warning_once( + f"Please note that '{key}' key" " is not currently used in quantization fine-tuning." + ) + reset_params(self.inputs[name]) + + if self._should_stop_cache_forward(name): + raise NotImplementedError + else: + if hidden_states is not None: + kwargs.pop("hidden_states") + return m.orig_forward(hidden_states, *positional_inputs, **kwargs) + else: + # Currently only for Llama-3.2-Vision-Instruct Series + return m.orig_forward(*positional_inputs, **kwargs) + + return forward + + def normalize_decoding_layer_inputs_(self, decoding_layer_inputs: list[tuple[tuple[Any, dict[str, Any]]]]): + """ + Processes and stores decoding layer inputs for block quantization. + + This function iterates through a list of captured decoding layer calls, + replaying them through a fake decoding layer to extract and store the + inputs required for the decoding block in `self.inputs`. This effectively + "normalizes" the inputs by making them accessible in a consistent format + for subsequent quantization steps. + + Args: + decoding_layer_inputs: + A list of entries captured by a forward hook on the decoding layer. + Each element is expected to be a tuple whose first item is + `(args, kwargs)`, where `args` are the positional arguments and + `kwargs` are the keyword arguments seen during the original + forward pass. 
+ + The capture hook look like: + + def input_capture_hook(module, *args, **kwargs): + _all_module_input[module._global_name].append((args, kwargs)) + """ + first_block_name = self.quant_block_list[0][0] + + class _FakeDecodingLayer(torch.nn.Module): + + def forward(self, *args, **kwargs): + return args, kwargs + + fake_layer = _FakeDecodingLayer() + fake_layer.orig_forward = fake_layer.forward + fake_layer.forward = partial(self._get_block_forward_func(first_block_name), fake_layer) + + self.inputs = {} + self.last_cache_name = None + for step_input in decoding_layer_inputs: + args, kwargs = step_input[0] + fake_layer(*args, **kwargs) + + def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output + + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids: list[torch.Tensor], + input_others: dict, + indices: list[int], + device: str, + cache_device: str = "cpu", + ) -> torch.Tensor: + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + output_q = self.block_forward( + block, current_input_ids, current_input_others, self.model_context.amp, self.model_context.amp_dtype, device + ) + return output_q.to(cache_device) + + def _get_current_num_elm( + self, + input_ids: list[torch.Tensor], + indices: list[int], + ) -> int: + current_input_ids = [input_ids[i] for i in indices] + return sum(id.numel() for id in current_input_ids) + + def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int: + current_tensors = [tensor[i] for i in indices] + non_zero_cnt = 0 + for t in current_tensors: + non_zero_cnt += torch.count_nonzero(t).item() + return non_zero_cnt + + def _get_loss( + self, + output_q: torch.Tensor, + current_output: torch.Tensor, + indices: torch.Tensor, + mse_loss: Callable, + device: Union[str, torch.device] = "cpu", + ): + autocast_ctx = ( + nullcontext() + if self.model_context.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.model_context.amp_dtype) + ) + if self.attention_mask: + tmp_attention_mask = [self.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + else: + with autocast_ctx: + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), current_output.to(torch.float32) + ) + + return loss + + def quantize_block( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload=True, + ): + """Quantize the weights of a given block of the model. + + Args: + block: The block of the model to be quantized. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + q_input: The quantized input tensor. + device: The device for quantization. 
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + + materialize_model_(block) + convert_module_to_hp_if_necessary(block, self.model_context.amp_dtype, device) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compress_context.device_map, + input_ids, + self.compress_context.low_gpu_mem_usage, + self.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compress_context.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = self._register_act_max_hook(block) + + output = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + self.compress_context.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + output = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + self.compress_context.cache_device, + ) + hook_handles = self._register_act_max_hook(block) + if hook_handles: + self._get_block_outputs( + block, + q_input if q_input is not None else input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + self.compress_context.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compress_context.device_list) + else: + clear_memory(device_list=self.compress_context.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.wrapper_block( + block, + self.enable_minmax_tuning, + self.enable_norm_bias_tuning, + enable_torch_compile=self.compress_context.enable_torch_compile, + device=device, + ) + # Call this before quantization and after applying the block wrapper. + if self.config.is_nv_fp: # enable qkv and moe structure global_scale fuse. 
+ from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.lr) + minmax_lr = torch.tensor(self.minmax_lr) + is_adam = "adam" in self.__class__.__name__.lower() + + extra_kwargs = {} if is_adam else {"momentum": self.momentum} + + if self.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters + ) + else: + lr_schedule = copy.deepcopy(self.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.batch_size * self.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.gradient_accumulate_steps != 1 and not self.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self._get_current_num_elm(input_ids, whole_indices) + setup_ddp_if_needed_(self, block, self.compress_context.device_list) + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.batch_size + for i in range(self.iters): + if self.enable_alg_ext and self.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.attention_mask: + num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) + + for tmp_step in range(self.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output(output, indices) + current_output = to_device(current_output, loss_device) + output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) + loss = self._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.compress_context.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compress_context.device_list) + + self._scale_loss_and_backward(scaler, loss) + + if 
self.compress_context.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compress_context.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.not_use_best_mse: + best_params = collect_best_params(block, self.compress_context.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.not_use_best_mse and i == self.iters - 1: + best_params = collect_best_params(block, self.compress_context.cache_device) + + if not self.not_use_best_mse: + if 0 < self.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.iters + if not self.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.compress_context.low_gpu_mem_usage: + clear_memory(device_list=self.compress_context.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if self.config.is_act_nv_fp: + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.enable_quanted_input: + q_outputs = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + device, + cache_device=self.compress_context.cache_device, + ) + + if len(self.compress_context.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=self.compress_context.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(self.compress_context.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=self.compress_context.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + """Quantize a specific layer of the model using the provided inputs. + + Args: + layer_name (str): The name of the layer to quantize. + inputs (torch.Tensor): Input data for quantization. + q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. + device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
+ + Returns: + None + """ + logger.info(f"quantizing layer {layer_name}") + layer = get_module(self.model, layer_name) + if hasattr(layer, "tuning_device"): + device = layer.tuning_device + + layer = layer.to(device) + for i in range(len(inputs)): + inputs[i] = inputs[i].to(layer.weight.dtype) + if q_inputs is not None: + q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + + if self.act_bits <= 8 and check_need_act_calibration( + self.act_dynamic, + self.act_data_type, + self.act_bits, + self.static_kv_dtype, + self.static_attention_dtype, + ): + tmp_inputs = q_inputs if q_inputs is not None else inputs + hook_handles = self._register_act_max_hook(layer) + with torch.no_grad(): + for input in tmp_inputs: + layer(input) + for handle in hook_handles: + handle.remove() + + wrapper_linear = WrapperLinear( + layer, + enable_minmax_tuning=self.enable_minmax_tuning, + enable_torch_compile=self.enable_torch_compile, + device=device, + ).to(device) + round_params = [] + minmax_params = [] + for key in wrapper_linear.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(wrapper_linear.params[key]) + else: + round_params.append(wrapper_linear.value) + if len(round_params) + len(minmax_params) <= 0: + dump_info = f"quantized {layer_name}" + logger.info(dump_info) + with torch.no_grad(): + unwrapper_layer(self.model, wrapper_linear, layer_name, {}) + mv_module_from_gpu(layer) + + lr = torch.tensor(self.lr) + minmax_lr = torch.tensor(self.minmax_lr) + if self.enable_minmax_tuning: + optimizer = self.optimizer( + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + ) + else: + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + + if self.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters + ) + else: + lr_schedule = copy.deepcopy(self.lr_scheduler) + nsamples = len(inputs) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + gradient_accumulate_steps = self.batch_size # Force to low gpu + + total_loss = 0 + num_elm = 1 + mse_reduction = "mean" + if gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + batch_size = 1 # Force to low gpu + global_batch_size = self.batch_size * gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + if gradient_accumulate_steps != 1 and not self.attention_mask: + whole_indices = torch.arange(global_batch_size) + if q_inputs is not None: + num_elm = self._get_current_num_elm(q_inputs, whole_indices) + else: + num_elm = self._get_current_num_elm(inputs, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + + for i in range(self.iters): + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.attention_mask: + num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) + + for tmp_step in range(gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + if q_inputs is not None: + current_input = [q_inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = [inputs[i] for i in indices] + org_input = torch.cat(org_input, dim=0).to(device) + else: + current_input = [inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = current_input 
+ with torch.no_grad(): + current_output = layer(org_input) + autocast_ctx = ( + nullcontext() + if not self.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) + ) + if self.attention_mask: + tmp_attention_mask = [self.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + + else: + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), + current_output.to(torch.float32), # mul 1.0 will copy the output + ) + + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + self._scale_loss_and_backward(scaler, loss) + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.not_use_best_mse: + best_params = collect_best_params(wrapper_linear, self.cache_device) + last_best_iter = i + if self.not_use_best_mse and i == self.iters - 1: + best_params = collect_best_params(wrapper_linear, self.cache_device) + + if not self.not_use_best_mse: + if 0 < self.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.iters + if not self.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + with torch.no_grad(): + unwrapper_layer(self.model, wrapper_linear, layer_name, best_params) + mv_module_from_gpu(layer) + dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + logger.info(dump_info) + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids: torch.Tensor | list[torch.Tensor], + input_others: torch.Tensor | dict, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + batch_dim: The batch dimension of the output tensor. + + Returns: + The output tensor of the block. 
+ """ + + self.block_forward = ( + compile_func(block_forward, self.device) if self.compress_context.enable_torch_compile else block_forward + ) + + output = [] + nsamples = len(input_ids) + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + self.seqlen, + self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + tmp_output = self.block_forward( + block, tmp_input_ids, tmp_input_others, self.model_context.amp, self.model_context.amp_dtype, device + ).to(self.compress_context.cache_device) + if save_output: + if self.batch_size == 1: + output.append(tmp_output) + else: + output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) + if self.compress_context.low_gpu_mem_usage: + clear_memory(device_list=self.compress_context.device_list) + + return output + + @classmethod + @torch.no_grad() + def _sampling_inputs( + cls, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + indices: list[int] | torch.Tensor, + seqlen: int, + batch_dim: int = 0, + share_cache_keys: tuple = (), + ): + """Samples inputs based on the given indices and sequence length. + + Args: + input_ids: The list of input tensor containing input_ids. + input_others: A dictionary containing other input data. + indices: The indices to sample from the input. + seqlen: The sequence length. + + Returns: + current_input_ids: The sampled input IDs. + current_input_others: The sampled other input data. + """ + if isinstance(input_ids, list): + current_input_ids = [input_ids[i] for i in indices] + current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + elif isinstance(input_ids, dict): + current_input_ids = defaultdict(list) + for k in input_ids.keys(): + current_input_ids[k].extend([input_ids[k][i] for i in indices]) + current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) + + current_input_others = {"positional_inputs": input_others["positional_inputs"]} + for key in input_others.keys(): + if "positional_inputs" in key: + continue + if (key not in share_cache_keys or len(indices) == 1) and not isinstance( + input_others[key], (str, bool, type(None)) + ): + current_input_others[key] = None + if input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = input_others[key] + + return current_input_ids, current_input_others + + def _get_optimizer(self, optimizer: Any): + """Returns the specified optimizer. In SignRound, we fix the optimizer. + + Args: + optimizer: The optimizer to be used. + + Returns: + The specified optimizer. + """ + if optimizer is not None: + logger.warning_once( + "The optimizer setting in config will be ignored in AutoRound, using SignSGD as default." + ) + return SignSGD + + def _get_scaler(self): + """Returns scaler, in SignRound, no need to use scaler.""" + return None + + def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: + """Scales the loss and performs backward pass. + + Args: + scaler: The scaler to be used. + loss: The loss to be scaled. + + Returns: + The scaled loss. 
+ """ + scale_loss = loss * 1000 + scale_loss.backward() + if is_hpex_available(): + htcore.mark_step() + return scale_loss + + def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): + """Performs a step in the optimization process. + + Args: + scaler: The scaler to be used. + optimizer: The optimizer for the step. + lr_schedule: The learning rate schedule. + + Returns: + None + """ + optimizer.step() + # for hpu + if is_hpex_available(): + htcore.mark_step() + optimizer.zero_grad() + lr_schedule.step() diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 18e300095..0d293afa4 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -11,17 +11,292 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy +import importlib +import sys +from dataclasses import fields +import torch -class BaseQuanizers: - def __init__(self): - pass +from auto_round.algorithms.quantization.config import QuantizationConfig +from auto_round.compressors_new.utils import ( + IndexSampler, + _get_quantized_layer_names_outside_blocks, + block_forward, + check_need_act_calibration, + check_skippable_keywords, + collect_best_params, + get_shared_keys, + infer_bits_by_data_type, + init_cache, + reset_params, + set_layer_config, +) +from auto_round.context.compress import CompressContext +from auto_round.context.model import ModelContext +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from auto_round.logger import logger +from auto_round.schemes import ( + QuantizationScheme, + _handle_special_schemes, + _parse_scheme, + get_gguf_scheme, + preset_name_to_scheme, +) +from auto_round.special_model_handler import get_predefined_ignore_layers, update_module +from auto_round.utils import ( + INNER_SUPPORTED_LAYER_TYPES, + SUPPORTED_LAYER_TYPES, + check_to_quantized, + convert_dtype_str2torch, + find_matching_blocks, + get_block_names, + is_quantized_input_module, +) - def pre_quantize(self): - pass - def quantize(self): - pass +class BaseQuantizers: - def post_quantize(self): - pass + def __init__(self, config: QuantizationConfig): + self.scheme = config.scheme + self.layer_config = config.layer_config + self.quant_lm_head = config.quant_lm_head + self.scale_dtype = config.scale_dtype + self.to_quant_block_names = config.to_quant_block_names + self.ignore_layers = config.ignore_layers + self.config = config + + @classmethod + def from_config(cls, config: QuantizationConfig): + if cls.__name__ == config._alg_cls: + return cls(config) + else: + module = importlib.import_module("auto_round.algorithms.quantization") + alg_cls = getattr(module, config._alg_cls) + return alg_cls(config) + + def post_init(self): + # should be set after loading model and set layer_config, cause some special scheme need these. + # Preserve the original, unparsed scheme for later use in auto scheme generation + # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). 
+ + # # Alternatively, you can use ModelContext.get_context + self.model_context = ModelContext() + self.compress_context = CompressContext() + + scheme_fields = {f.name for f in fields(QuantizationScheme)} + user_scheme_overrides = {} + for k in scheme_fields: + v = getattr(self.config, k, None) + if v is not None: + user_scheme_overrides[k] = v + default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides) + + # Bind attributes to self.config for easy instance-level access + for key, value in final_attrs.items(): + setattr(self.config, key, value) + self.config.check_config() + + self.orig_scheme = copy.deepcopy(self.scheme) + self.scheme = default_scheme + + gguf_scheme_name = get_gguf_scheme(self.scheme) + # GGUF uses fp32 scale dtype as default + if self.scale_dtype is None: + self.scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.scale_dtype = convert_dtype_str2torch(self.scale_dtype) + + if not self.is_auto_scheme: + enable_gguf_official_mixed = True + else: + enable_gguf_official_mixed = False + + if not hasattr(self, "quant_block_list"): + all_blocks = get_block_names(self.model_context.model) + self.quant_block_list = find_matching_blocks( + self.model_context.model, all_blocks, self.to_quant_block_names + ) + + self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) + + def _gen_auto_scheme(self) -> dict[str, dict]: + if self.mllm: + logger.info("AutoScheme is not yet supported for multimodal LLMs.") + sys.exit(-1) + + if is_quantized_input_module(self.model_context.model): + logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") + sys.exit(-1) + + all_dtypes = [] + all_gguf = True + for option in self.orig_scheme.options: + # Resolve the quantization scheme or data type + dtype = "int" + if isinstance(option, str): + if not option.lower().startswith("gguf"): + all_gguf = False + + option = preset_name_to_scheme(option) + + else: + all_gguf = False + + if isinstance(option, QuantizationScheme): + dtype = option.data_type + elif isinstance(option, dict): + dtype = option.get("data_type", "int") + + all_dtypes.append(dtype) + + # Check for mixed data types + unique_dtypes = set(all_dtypes) + if len(unique_dtypes) > 1 and not all_gguf: + logger.warning( + "Models with mixed data_types " + "cannot yet be exported to real formats except GGUF. " + "Please save the model using the `fake` format for now." 
+ ) + + layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( + self.model_context.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.ignore_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.mllm, + ) + quant_layer_names = layer_config.keys() + scheme_keys = {f.name for f in fields(QuantizationScheme)} + fixed_layer_scheme_new = { + k: {key: v[key] for key in scheme_keys & v.keys()} + for k, v in layer_config.items() + if v.get("fixed_by_user", False) + } + + # mainly using quant_layers and fixed by users + from auto_round.auto_scheme.gen_auto_scheme import GenScheme + + if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage: + logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") + self.scheme_generator = GenScheme( + self.orig_scheme, + self.model_context.model, + quant_layer_names, + fixed_layer_scheme_new, + self.dataset, + device_map=self.compress_context.device_map, + tokenizer=self.tokenizer, + enable_torch_compile=self.enable_torch_compile, + ) + layer_config = self.scheme_generator.get_layer_config() + return layer_config + + def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): + + is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf() + if not is_gguf_format: + predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) + if predefined_ignore_layers: + logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + tmp_str = ",".join(predefined_ignore_layers) + if self.ignore_layers == "": + self.ignore_layers = tmp_str + else: + self.ignore_layers += "," + tmp_str + + if self.is_auto_scheme: + self.layer_config = self._gen_auto_scheme() + else: + self.layer_config = _handle_special_schemes( + self.orig_scheme, + self.layer_config, + self.model_context.model, + supported_types=SUPPORTED_LAYER_TYPES, + inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, + quant_lm_head=self.quant_lm_head, + mllm=self.model_context.is_mllm, + ) + + fill_default_value = True + if self.is_auto_scheme: + fill_default_value = False + self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( + self.model_context.model, + self.layer_config, + self.scheme, + self.scale_dtype, + SUPPORTED_LAYER_TYPES, + INNER_SUPPORTED_LAYER_TYPES, + self.quant_block_list, + self.ignore_layers, + self.quant_lm_head, + enable_gguf_official_mixed=enable_gguf_official_mixed, + is_mllm=self.model_context.is_mllm, + fill_default_value=fill_default_value, + ) + + def _register_act_max_hook(self, model): + def get_act_max_hook(module, input, output): + if isinstance(input, (tuple, list)): + input = input[0] + if input.numel() == 0: + return # as no needs for act_max update + input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size) + act_max = torch.max(torch.abs(input), dim=-1).values + if not hasattr(module, "act_max") or module.act_max.numel() == 0: + module.act_max = act_max + if self.config.is_act_nv_fp: ## for nvfp per-tensor input_global_scale calculation usage + max_val = act_max.max() + module.act_max = max_val.unsqueeze(0) if max_val.dim() == 0 else max_val + else: + act_max = act_max.to(module.act_max.device) + if self.config.is_act_nv_fp: ## for nvfp per-tensor input_global_scale calculation usage + max_val = 
torch.max(act_max.max(), module.act_max.max()) + module.act_max = max_val.unsqueeze(0) if max_val.dim() == 0 else max_val + else: + module.act_max = torch.max(act_max, module.act_max) + + hook_handles = [] + # for single layers out of blocks, like lm_head + if isinstance(model, SUPPORTED_LAYER_TYPES): + m = model + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + return hook_handles + + for n, m in model.named_modules(): + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + + # for whole model, RTN + if n in self.layer_config: + config = self.layer_config[n] + act_dynamic = config.get("act_dynamic", True) + act_data_type = config.get("act_data_type", None) + act_bits = config.get("act_bits", 16) + if ( + config["bits"] <= 8 + and check_need_act_calibration(act_dynamic, act_data_type, act_bits) + and check_to_quantized(config) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + return hook_handles diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py new file mode 100644 index 000000000..a26eb5d35 --- /dev/null +++ b/auto_round/algorithms/quantization/config.py @@ -0,0 +1,181 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
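`_register_act_max_hook` above attaches forward hooks that keep a running absolute maximum of each layer's input in `module.act_max`. A stand-alone sketch of that hook pattern, using a hypothetical `act_max_hook`/`layer` and omitting the group-size reshaping and nvfp per-tensor handling:

import torch

def act_max_hook(module, inputs, output):
    # Keep a running abs-max of the layer input, as get_act_max_hook does above.
    x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
    if x.numel() == 0:
        return
    cur = torch.max(torch.abs(x), dim=-1).values
    if not hasattr(module, "act_max") or module.act_max.numel() == 0:
        module.act_max = cur
    else:
        module.act_max = torch.max(cur.to(module.act_max.device), module.act_max)

layer = torch.nn.Linear(8, 8)
handle = layer.register_forward_hook(act_max_hook)
layer(torch.randn(2, 8))  # populates layer.act_max
handle.remove()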
+import copy +from enum import Enum +from typing import Union + +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.auto_scheme.gen_auto_scheme import AutoScheme +from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG +from auto_round.logger import logger +from auto_round.schemes import ( + QuantizationScheme, + _handle_special_schemes, + _parse_scheme, + get_gguf_scheme, + preset_name_to_scheme, +) +from auto_round.utils import convert_dtype_str2torch + + +class BackendDataType(str, Enum): + STANDARD_FP = "fp" + MX_FP = "mx_fp" + NV_FP = "nv_fp" + FP8_STATIC = "fp8_static" + FP8 = "fp8" + + +class QuantizationConfig(AlgConfig): + _alg_cls: str = None + + def __init__( + self, + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + *, + # quantization args + bits: int = None, + group_size: int = None, + sym: bool = None, + data_type: str = None, + act_bits: int = None, + act_group_size: int = None, + act_sym: bool = None, + act_data_type: str = None, + act_dynamic: bool = None, + super_bits: int = None, + super_group_size: int = None, + scale_dtype: str = None, + ignore_layers: str = "", + quant_lm_head: bool = False, + to_quant_block_names: Union[str, list, None] = None, + ): + + self.scheme = scheme + self.layer_config = layer_config + + self.bits = bits + self.group_size = group_size + self.sym = sym + self.data_type = data_type + self.act_bits = act_bits + self.act_group_size = act_group_size + self.act_sym = act_sym + self.act_data_type = act_data_type + self.act_dynamic = act_dynamic + self.super_bits = super_bits + self.super_group_size = super_group_size + + self.scale_dtype = scale_dtype + self.ignore_layers = ignore_layers + self.quant_lm_head = quant_lm_head + self.to_quant_block_names = to_quant_block_names + + def check_config(self) -> None: + """Checks if the configurations are valid. + + Raises: + ValueError, TypeError: If any of the configurations are invalid. + """ + if self.bits <= 0: + raise ValueError("`bits` must be positive") + if self.act_bits <= 0: + raise ValueError("`act_bits` must be positive") + if not (self.group_size == -1 or self.group_size >= 0): + raise ValueError("`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") + if not (self.act_group_size == -1 or self.act_group_size >= 0): + raise ValueError("`act_group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") + """Reset the default value of super_bits and super_group_size""" + if self.data_type.endswith("_dq"): + gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"] + self.super_bits = gguf_config.get("super_bits", None) if self.super_bits is None else self.super_bits + self.super_group_size = ( + gguf_config.get("super_group_size", None) if self.super_group_size is None else self.super_group_size + ) + + if ( + self.is_act_quantize + and (not self.is_act_nv_fp or "static_gs" not in self.act_data_type) + and not self.is_act_mx_fp + and not self.is_dynamic_wint8aint8 + and not self.is_static_afp8 + ): + logger.warning( + "activation quantization is an experimental feature with limited support and a complex API. 
" + "And please save the quantized model to fake format as real deployment is not supported currently" + ) + if self.is_mx_fp and self.group_size != 32: + logger.warning("dtype mx_fp should only support group_size of 32 in real deployment") + + if self.is_nv_fp and (self.group_size != 16): + logger.warning("dtype nv_fp should only support group_size of 16 in real deployment") + + @property + def is_act_quantize(self): + return self.act_bits is not None and self.act_bits <= 8 + + @property + def is_nv_fp(self): + return BackendDataType.NV_FP in self.data_type + + @property + def is_act_nv_fp(self): + return BackendDataType.NV_FP in self.act_data_type + + @property + def is_mx_fp(self): + return BackendDataType.MX_FP in self.data_type + + @property + def is_act_mx_fp(self): + return BackendDataType.MX_FP in self.act_data_type + + @property + def is_dynamic_wint8aint8(self): + if self.act_dynamic: + return True + if ("int8" in self.act_data_type or ("int" in self.act_data_type and self.act_bits == 8)) and ( + "int8" in self.data_type or ("int" in self.data_type and self.bits == 8) + ): + return True + return False + + @property + def is_standard_fp(self, act=False): + return BackendDataType.STANDARD_FP in self.data_type and not self.is_mx_fp and not self.is_nv_fp + + @property + def is_act_standard_fp(self, act=False): + return BackendDataType.STANDARD_FP in self.act_data_type and not self.is_act_mx_fp and not self.is_act_nv_fp + + @property + def is_static_afp8(self): + return BackendDataType.FP8_STATIC in self.act_data_type + + @property + def is_static_wfp8afp8(self): + return BackendDataType.FP8_STATIC in self.data_type and self.is_static_afp8 + + @property + def is_wfp8afp8(self): + if ( + ("fp8" in self.act_data_type or ("fp" in self.act_data_type and self.act_bits == 8)) + and ("fp8" in self.data_type or ("fp" in self.data_type and self.bits == 8)) + and self.is_standard_fp(act=True) + and self.is_standard_fp(act=False) + ): + return True + else: + return False diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 7d9cbb9f8..a968c55e6 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -31,9 +31,7 @@ from auto_round import envs from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.base import BaseAlgorithm -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from auto_round.algorithms.quantization.auto_round.quantize import ARQuantizer -from auto_round.auto_scheme.gen_auto_scheme import AutoScheme +from auto_round.algorithms.quantization import ARQuantizer, AutoRoundConfig, BaseQuantizers, QuantizationConfig from auto_round.calibration.utils import ( _infer_last_cache_name, _update_inputs, @@ -52,20 +50,13 @@ reset_params, set_layer_config, ) -from auto_round.context.compress_context import CompressContext -from auto_round.context.model_context import ModelContext +from auto_round.context.compress import CompressContext +from auto_round.context.model import ModelContext from auto_round.data_type import QUANT_FUNC_WITH_DTYPE -from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG from auto_round.formats import OutputFormat, get_formats from auto_round.logger import logger from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ -from auto_round.schemes import ( - QuantizationScheme, - _handle_special_schemes, - _parse_scheme, - get_gguf_scheme, - preset_name_to_scheme, -) +from auto_round.schemes import 
QuantizationScheme from auto_round.special_model_handler import get_predefined_ignore_layers, update_module from auto_round.utils import ( INNER_SUPPORTED_LAYER_TYPES, @@ -148,98 +139,6 @@ ) -class BackendDataType(str, Enum): - STANDARD_FP = "fp" - MX_FP = "mx_fp" - NV_FP = "nv_fp" - - -@dataclass -class QuantizationArgs: - bits: int = None - group_size: int = None - sym: bool = None - data_type: str = None - act_bits: int = None - act_group_size: int = None - act_sym: bool = None - act_data_type: str = None - act_dynamic: bool = None - super_bits: int = None - super_group_size: int = None - - def is_act_quantize(self): - return self.act_bits is not None and self.act_bits <= 8 - - def is_nv_fp(self, act=False): - data_type = self.data_type if act is False else self.act_data_type - return BackendDataType.NV_FP in data_type - - def is_mx_fp(self, act=False): - data_type = self.data_type if act is False else self.act_data_type - return BackendDataType.MX_FP in data_type - - def is_dynamic_wint8aint8(self): - if self.act_dynamic: - return True - if ("int8" in self.act_data_type or ("int" in self.act_data_type and self.act_bits == 8)) and ( - "int8" in self.data_type or ("int" in self.data_type and self.bits == 8) - ): - return True - return False - - def is_static_wfp8afp8(self, act=False): - data_type = self.data_type if act is False else self.act_data_type - return "fp8_static" in data_type - - def is_standard_fp(self, act=False): - data_type = self.data_type if act is False else self.act_data_type - return BackendDataType.STANDARD_FP in data_type and not self.is_mx_fp(act=act) and not self.is_nv_fp(act=act) - - def is_wfp8afp8(self): - if ( - ("fp8" in self.act_data_type or ("fp" in self.act_data_type and self.act_bits == 8)) - and ("fp8" in self.data_type or ("fp" in self.data_type and self.bits == 8)) - and self.is_standard_fp(act=True) - and self.is_standard_fp(act=False) - ): - return True - else: - return False - - @classmethod - def from_dict(cls, config: dict): - new_config = {} - for k, v in config.items(): - if hasattr(cls, k): - new_config[k] = v - return cls(**new_config) - - def non_default(self): - config = {} - for k, v in asdict(self).items(): - if v: - config[k] = v - return config - - def check_config(self): - if self.bits <= 0: - raise ValueError("`bits` must be positive") - if self.act_bits <= 0: - raise ValueError("`act_bits` must be positive") - if not (self.group_size == -1 or self.group_size >= 0): - raise ValueError("`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") - if not (self.act_group_size == -1 or self.act_group_size >= 0): - raise ValueError("`act_group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") - """Reset the default value of super_bits and super_group_size""" - if self.data_type.endswith("_dq"): - gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"] - self.super_bits = gguf_config.get("super_bits", None) if self.super_bits is None else self.super_bits - self.super_group_size = ( - gguf_config.get("super_group_size", None) if self.super_group_size is None else self.super_group_size - ) - - class Compressor(object): SKIP_ARGS = ("local_args", "kwargs", "cls", "config") @@ -248,7 +147,6 @@ def __new__( config: Union[AlgConfig, list[AlgConfig]], model: Union[torch.nn.Module, str], tokenizer=None, - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", platform="hf", format=None, **kwargs, @@ -257,27 +155,22 @@ def __new__( local_args = {k: v for k, v in locals().items() if k not in 
cls.SKIP_ARGS} if isinstance(config, AutoRoundConfig): - return BaseCompressor(ARQuantizer(config), **local_args, **kwargs) + return BaseCompressor(config, **local_args, **kwargs) class BaseCompressor(object): need_calib: bool = True + supported_types = SUPPORTED_LAYER_TYPES def __init__( self, - algorithms: Union[BaseAlgorithm, list[BaseAlgorithm]], + config: Union[AlgConfig, list[AlgConfig]], model: Union[torch.nn.Module, str], tokenizer=None, platform="hf", format=None, - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, low_gpu_mem_usage: bool = False, device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, @@ -287,32 +180,26 @@ def __init__( low_cpu_mem_usage: bool = True, **kwargs, ): - self.quantization_args = QuantizationArgs.from_dict(kwargs) + self.quantize_config = None + self.config_list = config if isinstance(config, list) else [config] + for config in self.config_list: + if isinstance(config, QuantizationConfig): + self.quantize_config = config + assert self.quantize_config is not None, "QuantizationConfig is required for Compressor" + self.config_list.remove(self.quantize_config) - self.algorithms = algorithms if isinstance(algorithms, list) else [algorithms] # TODO: refactor calibration self.calibration = None - self.scheme = scheme self.formats = format - self.layer_config = layer_config - self.seqlen = seqlen # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options amp = kwargs.pop("amp", True) - not_use_best_mse = kwargs.pop("not_use_best_mse", False) - dynamic_max_gap = kwargs.pop("dynamic_max_gap", -1) nblocks = kwargs.pop("nblocks", 1) - to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) - enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True) disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True) enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) - self.momentum = kwargs.pop("momentum", 0.0) enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) - self.quant_lm_head = kwargs.pop("quant_lm_head", False) - - self.ignore_layers = kwargs.pop("ignore_layers", "") self._offloader = OffloadManager(enabled=low_cpu_mem_usage, mode="offload", offload_dir_prefix="compressor") @@ -320,8 +207,6 @@ def __init__( model_dtype = kwargs.pop("model_dtype", None) trust_remote_code = kwargs.pop("trust_remote_code") if "trust_remote_code" in kwargs else True - self.scale_dtype = kwargs.pop("scale_dtype", None) - self.static_attention_dtype = kwargs.pop("static_attention_dtype", None) # Attention static dtype if self.static_attention_dtype is not None: @@ -347,8 +232,6 @@ def __init__( else: torch.use_deterministic_algorithms(True, warn_only=True) - self.to_quant_block_names = to_quant_block_names - device = kwargs.pop("device", None) if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") @@ -356,11 +239,7 @@ def __init__( # Tuning hyperparameters self.seed = seed set_seed(self.seed) - self.enable_quanted_input = enable_quanted_input - self.nsamples = nsamples - self.seqlen = seqlen - self.batch_size, self.gradient_accumulate_steps = 
batch_size, gradient_accumulate_steps self.nblocks = nblocks self.dataset = dataset self.iters = iters @@ -379,35 +258,25 @@ def __init__( self.enable_torch_compile = enable_torch_compile self.enable_alg_ext = enable_alg_ext - self.not_use_best_mse = not_use_best_mse - self.dynamic_max_gap = dynamic_max_gap # Whether to pack the layer immediately after tuning self.is_immediate_packing = False self.is_immediate_saving = False - # Some helpers - self.batch_dim = None - torch.set_printoptions(precision=3, sci_mode=True) if is_hpex_available(): logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - self.attention_mask = [] - - self.wrapper_block = wrapper_block - if self.enable_alg_ext: - try: - logger.warning_once("using algorithm extension for quantization.") - from auto_round.alg_ext import wrapper_autoround - - wrapper_autoround(self) - except (ImportError, ModuleNotFoundError): - logger.error("algorithm extension import error, fallback to default mode") - - self.model_context = ModelContext.create_context( + # Alternatively, you can use CompressContext.create_context + self.compress_context = CompressContext( + low_cpu_mem_usage, + low_gpu_mem_usage, + device_map, + enable_torch_compile, + ) + self.model_context = ModelContext( model, tokenizer=tokenizer, platform=platform, @@ -417,19 +286,13 @@ def __init__( need_calib=self.need_calib, device=self.compress_context.device, ) - self.compress_context = CompressContext.create_context( - low_cpu_mem_usage, - low_gpu_mem_usage, - device_map, - enable_torch_compile, - ) # backward compatible with the legacy API def __getattr__(self, name: str) -> Any: if name in self.__dict__: return self.__dict__[name] - for obj in ["quantization_args", "model_context"]: + for obj in ["quantize_config", "model_context", "compress_context", "quantizer"]: if obj not in self.__dict__: continue obj = object.__getattribute__(self, obj) @@ -440,210 +303,56 @@ def __getattr__(self, name: str) -> Any: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - def _check_configs(self) -> None: - """Checks if the configurations are valid. - - Raises: - ValueError, TypeError: If any of the configurations are invalid. - """ - self.quantization_args.check_config() - if self.batch_size <= 0: - raise ValueError("`batch_size` must be positive") - if self.iters < 0: - raise ValueError("`iters` must be non-negative") - if self.seqlen <= 0: - raise ValueError("`seqlen` must be positive") - if self.nblocks <= 0: - raise ValueError("`nblocks` must be positive") - if self.gradient_accumulate_steps <= 0: - raise ValueError("`gradient_accumulate_steps` must be positive") - - if ( - self.quantization_args.is_act_quantize() - and ( - not self.quantization_args.is_nv_fp(act=True) or "static_gs" not in self.quantization_args.act_data_type - ) - and not self.quantization_args.is_mx_fp(act=True) - and not self.quantization_args.is_dynamic_wint8aint8() - and not self.quantization_args.is_static_wfp8afp8() - ): - logger.warning( - "activation quantization is an experimental feature with limited support and a complex API. 
" - "And please save the quantized model to fake format as real deployment is not supported currently" - ) - - if self.quantization_args.is_mx_fp() and self.group_size != 32: - logger.warning("dtype mx_fp should only support group_size of 32 in real deployment") - - if self.quantization_args.is_nv_fp() and (self.group_size != 16): - logger.warning("dtype nv_fp should only support group_size of 16 in real deployment") - - if self.nsamples < self.gradient_accumulate_steps * self.batch_size: - if self.batch_size > self.nsamples: - if self.iters > 0: # GGUF should log this warning, but we don't know the format here - logger.warning( - f"reset `batch_size` to {self.nsamples} as `nsamples`({self.nsamples})" - f" is smaller than batch_size({self.batch_size})" - ) - self.batch_size = self.nsamples - if self.gradient_accumulate_steps > self.nsamples // self.batch_size: - self.gradient_accumulate_steps = self.nsamples // self.batch_size - logger.warning( - f"reset `gradient_accumulate_steps` to {self.gradient_accumulate_steps}" - f" as nsamples must equal or greater" - f" than gradient_accumulate_steps * batch_size" - ) - - def _gen_auto_scheme(self) -> dict[str, dict]: - if self.mllm: - logger.info("AutoScheme is not yet supported for multimodal LLMs.") - sys.exit(-1) - - if is_quantized_input_module(self.model_context.model): - logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") - sys.exit(-1) - - all_dtypes = [] - all_gguf = True - for option in self.orig_scheme.options: - # Resolve the quantization scheme or data type - dtype = "int" - if isinstance(option, str): - if not option.lower().startswith("gguf"): - all_gguf = False - - option = preset_name_to_scheme(option) - - else: - all_gguf = False - - if isinstance(option, QuantizationScheme): - dtype = option.data_type - elif isinstance(option, dict): - dtype = option.get("data_type", "int") - - all_dtypes.append(dtype) - - # Check for mixed data types - unique_dtypes = set(all_dtypes) - if len(unique_dtypes) > 1 and not all_gguf: - logger.warning( - "Models with mixed data_types " - "cannot yet be exported to real formats except GGUF. " - "Please save the model using the `fake` format for now." 
- ) - - layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( - self.model_context.model, - self.layer_config, - self.scheme, - self.scale_dtype, - self.supported_types, - self.inner_supported_types, - self.quant_block_list, - self.ignore_layers, - self.quant_lm_head, - enable_gguf_official_mixed=False, - is_mllm=self.mllm, - ) - quant_layer_names = layer_config.keys() - scheme_keys = {f.name for f in fields(QuantizationScheme)} - fixed_layer_scheme_new = { - k: {key: v[key] for key in scheme_keys & v.keys()} - for k, v in layer_config.items() - if v.get("fixed_by_user", False) - } - - # mainly using quant_layers and fixed by users - from auto_round.auto_scheme.gen_auto_scheme import GenScheme - - if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage: - logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") - self.scheme_generator = GenScheme( - self.orig_scheme, - self.model_context.model, - quant_layer_names, - fixed_layer_scheme_new, - self.dataset, - device_map=self.compress_context.device_map, - tokenizer=self.tokenizer, - enable_torch_compile=self.enable_torch_compile, - ) - layer_config = self.scheme_generator.get_layer_config() - return layer_config - def post_init(self): + self.model_context._load_model() assert self.model_context._model_loaded, "should load model first" - # should be set after loading model and set layer_config, cause some special scheme need these. - # Preserve the original, unparsed scheme for later use in auto scheme generation - # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). - default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme( - self.scheme, self.quantization_args.non_default() - ) - # Bind attributes to self for easy instance-level access - # for key, value in final_attrs.items(): - # setattr(self, key, value) - self.quantization_args = QuantizationArgs.from_dict(final_attrs) - self._check_configs() - self.orig_scheme = copy.deepcopy(self.scheme) - self.scheme = default_scheme + self.quantizer = BaseQuantizers.from_config(self.quantize_config) + self.quantizer.post_init() + self.wrapper_block = wrapper_block + + # TODO: add other algs here when they are ready + # self.other_alg = OtherAlg.from_config(self.other_alg_config) if self.other_alg_config is not None else None + # self.other_alg.post_init() if self.other_alg is not None else None # check and update the format based on the current configuration if self.formats: self.formats = get_formats(self.formats, self) - gguf_scheme_name = get_gguf_scheme(self.scheme) - # GGUF uses fp32 scale dtype as default - if self.scale_dtype is None: - self.scale_dtype = "fp32" if gguf_scheme_name else "fp16" - self.scale_dtype = convert_dtype_str2torch(self.scale_dtype) - - if not hasattr(self, "quant_block_list"): - all_blocks = get_block_names(self.model_context.model) - self.quant_block_list = find_matching_blocks( - self.model_context.model, all_blocks, self.to_quant_block_names - ) # Set device, must place after model loading set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) - if self.iters != 0 and self.orig_disable_opt_rtn is not None: - logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") - self.disable_opt_rtn = True - if ( - self.quantization_args.bits >= 8 - and self.quantization_args.act_bits >= 8 - and self.iters == 0 - and self.quantization_args.data_type 
== "int" - and self.disable_opt_rtn is None - ): - logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") - self.disable_opt_rtn = True - if self.disable_opt_rtn is None and self.iters == 0: - logger.info( - "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." - ) - self.disable_opt_rtn = False + # if self.iters != 0 and self.orig_disable_opt_rtn is not None: + # logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") + # self.disable_opt_rtn = True + # if ( + # self.quantization_args.bits >= 8 + # and self.quantization_args.act_bits >= 8 + # and self.iters == 0 + # and self.quantization_args.data_type == "int" + # and self.disable_opt_rtn is None + # ): + # logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") + # self.disable_opt_rtn = True + # if self.disable_opt_rtn is None and self.iters == 0: + # logger.info( + # "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." + # ) + # self.disable_opt_rtn = False # after setting iters self._adjust_torch_compile(self.enable_torch_compile) + self.compress_context.enable_torch_compile = self.enable_torch_compile self.block_forward = ( compile_func(block_forward, self.compress_context.device) if self.enable_torch_compile else block_forward ) - if not self.is_auto_scheme: - enable_gguf_official_mixed = True - else: - enable_gguf_official_mixed = False - - self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) - if self.compress_context.low_cpu_mem_usage: self._offloader.reset() def _should_disable_inplace_due_to_layers_outside_block() -> bool: - return self.has_qlayer_outside_block and self.need_calib + return self.quantizer.has_qlayer_outside_block and self.need_calib # Disable inplace mode when there are quantized layers outside blocks # under specific iteration/optimization settings. @@ -661,10 +370,10 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: if ( not self.enable_torch_compile and TORCH_VERSION_AT_LEAST_2_6 - and self.quantization_args.act_bits > 8 + and self.quantize_config.act_bits > 8 and not is_debug_mode() - and "fp8" not in self.quantization_args.data_type - and "fp8" not in self.quantization_args.act_data_type + and "fp8" not in self.quantize_config.data_type + and "fp8" not in self.quantize_config.act_data_type and self.iters > 0 ): logger.info( @@ -673,63 +382,20 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: "Enabling it can reduce tuning cost by 20%, but it might throw an exception.", ) # On HPU, we rely on torch.compile to speed up the model execution. 
- if self.enable_torch_compile and self.quantization_args.is_wfp8afp8 and not is_hpex_available(): + if self.enable_torch_compile and self.quantize_config.is_wfp8afp8 and not is_hpex_available(): self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as fp8 is enabled") # TODO: fix https://github.com/intel/auto-round/issues/1109 - if self.enable_torch_compile and self.quantization_args.is_nv_fp(act=True): + if self.enable_torch_compile and self.quantize_config.is_act_nv_fp: self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): - is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf() - if not is_gguf_format: - predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) - if predefined_ignore_layers: - logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") - tmp_str = ",".join(predefined_ignore_layers) - if self.ignore_layers == "": - self.ignore_layers = tmp_str - else: - self.ignore_layers += "," + tmp_str - - if self.is_auto_scheme: - self.layer_config = self._gen_auto_scheme() - else: - self.layer_config = _handle_special_schemes( - self.orig_scheme, - self.layer_config, - self.model_context.model, - supported_types=SUPPORTED_LAYER_TYPES, - inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, - quant_lm_head=self.quant_lm_head, - mllm=self.model_context.is_mllm, - ) - - fill_default_value = True - if self.is_auto_scheme: - fill_default_value = False - self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( - self.model_context.model, - self.layer_config, - self.scheme, - self.scale_dtype, - SUPPORTED_LAYER_TYPES, - INNER_SUPPORTED_LAYER_TYPES, - self.quant_block_list, - self.ignore_layers, - self.quant_lm_head, - enable_gguf_official_mixed=enable_gguf_official_mixed, - is_mllm=self.model_context.is_mllm, - fill_default_value=fill_default_value, - ) - def _adjust_immediate_packing_and_saving(self): formats = getattr(self, "formats", []) if len(formats) == 1 and not formats[0].is_fake() and self.inplace: self.is_immediate_packing = True - if self.has_qlayer_outside_block and self.iters != 0: + if self.quantizer.has_qlayer_outside_block and self.iters != 0: self.is_immediate_packing = False if not ("causallm" in self.model_context.model.__class__.__name__.lower() and not self.model_context.is_mllm): @@ -767,12 +433,12 @@ def _adjust_immediate_packing_and_saving(self): ) self.compress_context.low_cpu_mem_usage = False self.is_immediate_saving = False - elif self.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0: + elif self.quantizer.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0: logger.info( "Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): " "RTN path uses blockwise quantization and supports per-block offloading." ) - elif self.has_qlayer_outside_block and self.iters > 0: + elif self.quantizer.has_qlayer_outside_block and self.iters > 0: logger.warning( "`low_cpu_mem_usage` is not fully supported " "when there are quantized layers outside blocks and optimized RTN is disabled. 
" @@ -811,7 +477,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l if self.compress_context.low_gpu_mem_usage or ( len(block_names) == 1 and len(layer_names) == 0 - and not self.has_qlayer_outside_block + and not self.quantizer.has_qlayer_outside_block and (last_cache_name is None or last_cache_name in block_names) ): # low_gpu_mem_usage or calibrate only the embedding layer, which is also very fast on CPU @@ -920,7 +586,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l try: logger.info("switch to cpu to cache block inputs") self.compress_context.cache_device = torch.device("cpu") - if self.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM": + if self.quantizer.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM": logger.warning( "we recommend using more GPUs in calibration." " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy." @@ -964,7 +630,7 @@ def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_n ## have bug if block name is not the first block if (len(block_names) > 1 or len(layer_names) > 0) and self.compress_context.low_gpu_mem_usage: tmp_dtype = self.model_context.model.dtype - if self.amp: + if self.model_context.amp: if self.model_context.model.dtype != self.model_context.model.dtype: self.model_context.model = self.model_context.model.to(torch.bfloat16) else: @@ -973,7 +639,7 @@ def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_n self.last_cache_name = _infer_last_cache_name(block_names, layer_names, last_cache_name) self._cache_target_set = set(self.to_cached_layers) self._cache_seen_targets = set() - calib_bs = self.batch_size + calib_bs = self.quantizer.batch_size self.hook_handles = [] self._replace_forward() self.calib(nsamples, calib_bs) @@ -1093,7 +759,7 @@ def calib(self, nsamples, bs): # last position, so the impact on accuracy is minimal as basically equivalent to dropping a single token new_attention_mask[:, -1] = 0 - self.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) + self.quantizer.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) else: new_attention_mask = None try: @@ -1161,7 +827,7 @@ def post_process_cache_data(batch_size, data, data_name): new_data = data if batch_size <= 1: return new_data - if data_name in self.shared_cache_keys: + if data_name in self.model_context.shared_cache_keys: return None if "alibi" in data_name: if isinstance(data, torch.Tensor): @@ -1185,12 +851,12 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): self.inputs[name] = {} init_cache(positional_inputs, self.inputs[name]) - if self.batch_dim is None: - self.batch_dim = 0 - if hidden_states is not None and self.batch_size > 1: - if hidden_states.shape[0] > self.batch_size: - self.batch_dim = 1 - if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.batch_size: + if self.quantizer.batch_dim is None: + self.quantizer.batch_dim = 0 + if hidden_states is not None and self.quantizer.batch_size > 1: + if hidden_states.shape[0] > self.quantizer.batch_size: + self.quantizer.batch_dim = 1 + if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.quantizer.batch_size: logger.error( "this model has not been supported, " "please raise an issue in https://github.com/intel/auto-round/issues" @@ -1210,23 +876,25 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): ): if key not in self.inputs[name].keys(): # 
initialization data = to_device(kwargs[key], device=torch.device("cpu")) - if data is None or (self.batch_size > 1 and key in self.shared_cache_keys): + if data is None or ( + self.quantizer.batch_size > 1 and key in self.model_context.shared_cache_keys + ): self.inputs[name][key] = data continue - if self.batch_size <= 1: + if self.quantizer.batch_size <= 1: self.inputs[name][key] = [data] else: - data = post_process_cache_data(self.batch_size, data, key) - self.inputs[name][key] = list(torch.split(data, 1, dim=self.batch_dim)) + data = post_process_cache_data(self.quantizer.batch_size, data, key) + self.inputs[name][key] = list(torch.split(data, 1, dim=self.quantizer.batch_dim)) else: # append cache inputs - new_data = post_process_cache_data(self.batch_size, kwargs[key], key) + new_data = post_process_cache_data(self.quantizer.batch_size, kwargs[key], key) if new_data is None: # shareable args or NoneType continue new_data = to_device(new_data, device=torch.device("cpu")) - if self.batch_size <= 1: + if self.quantizer.batch_size <= 1: self.inputs[name][key].append(new_data) else: - self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.batch_dim))) + self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.quantizer.batch_dim))) elif isinstance(kwargs[key], (str, bool, type(None))): if key not in self.inputs[name].keys(): self.inputs[name][key] = kwargs[key] @@ -1312,7 +980,7 @@ def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): input_others = to_device(input_others, self.compress_context.cache_device) # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - tmp_dtype = self.model_context._amp_dtype if self.model_context._amp else torch.float32 + tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 input_ids = to_dtype(input_ids, tmp_dtype) for key in input_others.keys(): @@ -1336,7 +1004,7 @@ def _quantize_embedding_layer(self): """Quantizes embedding layers in the model according to the configuration. This method iterates through all modules in the model, identifies embedding - layers specified in `self.layer_config`, and applies the appropriate quantization + layers specified in `self.quantizer.layer_config`, and applies the appropriate quantization function based on bit precision, grouping strategy, and dtype. 
Returns: @@ -1345,16 +1013,16 @@ def _quantize_embedding_layer(self): is_quantized = False for name, module in self.model.named_modules(): # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: + if not isinstance(module, torch.nn.Embedding) or name not in self.quantizer.layer_config: continue - config = self.layer_config[name] + config = self.quantizer.layer_config[name] # Skip layers that are not marked for quantization if not check_to_quantized(config): continue is_quantized = True - config["scale_dtype"] = self.scale_dtype + config["scale_dtype"] = self.quantizer.scale_dtype dtype = config["data_type"] # Determine quantization function key with symmetry/asymmetry @@ -1410,7 +1078,7 @@ def _quantize_embedding_layer(self): setattr(module, param_name, value) # Update config - self.layer_config.setdefault(name, {}).update(config) + self.quantizer.layer_config.setdefault(name, {}).update(config) del weight del scale del zp @@ -1469,14 +1137,13 @@ def _quantize_blocks( self._offloader.reload(model, names) m.config = model.config if hasattr(model, "config") else None - for alg in self.algorithms: - q_input, input_ids = alg.quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) + q_input, input_ids = self.quantizer.quantize_block( + m, + input_ids, + input_others, + q_input=q_input, + device=device, + ) if hasattr(model, "config"): del m.config if self.is_immediate_packing: @@ -1515,26 +1182,26 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ - self.model_context._load_model() + self.post_init() self.model_context.initialize(formats=self.formats) self._check_compatibility() - if bool(self.quant_block_list): - all_blocks = self.quant_block_list + if bool(self.quantizer.quant_block_list): + all_blocks = self.quantizer.quant_block_list else: all_blocks = get_block_names(self.model_context.model) if len(all_blocks) == 0: logger.warning("could not find blocks, exit with original model") - return self.model_context.model, self.layer_config + return self.model_context.model, self.quantizer.layer_config layer_names = _get_quantized_layer_names_outside_blocks( model=self.model_context.model, - layer_config=self.layer_config, + layer_config=self.quantizer.layer_config, supported_types=SUPPORTED_LAYER_TYPES, - quant_block_list=self.quant_block_list, + quant_block_list=self.quantizer.quant_block_list, ) start_time = time.time() all_first_block_names = [block[0] for block in all_blocks] @@ -1588,8 +1255,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if "input_ids" in inputs.keys(): total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples + if total_samples < self.quantizer.batch_size: + self.quantizer.batch_size = total_samples logger.warning(f"force the train batch size to {total_samples}") self._quantize_blocks( @@ -1611,7 +1278,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self._quantize_layers(layer_names, all_inputs) convert_module_to_hp_if_necessary( - self.model_context.model, self.amp_dtype, self.compress_context.device, to_cpu=True + self.model_context.model, self.model_context.amp_dtype, self.compress_context.device, to_cpu=True ) if self.is_immediate_saving: shard_writer(self, is_finalize=True) @@ -1642,7 +1309,101 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: logger.info(summary_info) 
self.model_context.quantized = True - return self.model_context.model, self.layer_config + return self.model_context.model, self.quantizer.layer_config + + def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: + """Quantizes specified layers based on inputs and configuration. + + Args: + layer_names (list): list of layer names to quantize. + layer_inputs (dict): Dictionary mapping layer names to input data. + + Returns: + None + """ + # TODO currently we take all the layers outside blocks as post block layers which is not optimal + # if there is no input for layer, we use rtn + + for layer_name in copy.deepcopy(layer_names): + if layer_name not in layer_inputs: + if self.act_bits < 16 and not self.act_dynamic: + # Activation quantization requires collected inputs + msg_prefix = ( + f"Activation max hook for layer '{layer_name}' is unavailable due to " + f"insufficient collected inputs. " + ) + if "fp8_e5m2" in self.act_data_type: + logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") + else: + logger.warning( + msg_prefix + "Static activation quantization is not supported or ineffective, " + "Skipping quantization for this layer." + ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.model, layer_name) + layer = layer.to(self.device) + layer = convert_module_to_hp_if_necessary(layer, self.model_context.amp_dtype, self.device) + set_module(self.model, layer_name, layer) + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.enable_torch_compile, + device=self.device, + disable_opt_rtn=self.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.enable_quanted_input + has_gguf = False + + if hasattr(self, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.formats) + if has_gguf and self.is_immediate_packing: + enable_quanted_input = False + + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: + dispatch_model(self.model, self.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.model + ) # self.model.hf_device_map has not been changed + if not self.is_immediate_saving: + self.model = mv_module_from_gpu(self.model) + clear_memory(device_list=self.device_list) + quant_layer = self.quantizer.quantize_layer + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.cache_device) + quant_layer(layer_name, layer_input, q_layer_input, device=self.device) + if self.is_immediate_packing: + self._immediate_pack(layer_name) + + if self.is_immediate_saving: + m = 
get_module(self.model, layer_name) + shard_writer(self, m, name=layer_name, is_finalize=False) + del layer_input + clear_memory(q_layer_input, device_list=self.device_list) + memory_monitor.log_summary() def _check_compatibility(self) -> None: """Checks compatibility of the configurations and model.""" @@ -1750,7 +1511,7 @@ def save_quantized( compressed_model = format.save_quantized( save_folder, model=self.model_context.model, - layer_config=self.layer_config, + layer_config=self.quantizer.layer_config, inplace=inplace, tokenizer=self.tokenizer, device=self.compress_context.device, diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index 990c5c387..b6dd329e3 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -55,6 +55,10 @@ def __init__(self, rounder): self.global_weight_map = {} self.shard_counter = 0 + # Persistent set of all parameter names already flushed to a shard file. + # Maintained incrementally in _flush_shard to avoid O(N^2) rebuilds in _add_tensor. + self._all_saved = set() + # Stats self.total_param_elems = 0 self.total_param_size_bytes = 0 @@ -99,6 +103,11 @@ def _add_tensor(self, name: str, tensor: torch.Tensor): if isinstance(tensor, torch.Tensor) and tensor.device.type == "meta": self.skipped_meta_tensors.append(name) return + + # Guard against duplicate saving of the same parameter + if name in self._all_saved or name in self.current_shard_tensors: + return + t_size = tensor.nbytes self.total_param_elems += tensor.numel() self.total_param_size_bytes += t_size @@ -135,6 +144,7 @@ def _flush_shard(self): saved_params = list(self.current_shard_tensors.keys()) self.shard_meta.append({"tmp_file": tmp_name, "params": saved_params}) + self._all_saved.update(saved_params) # Offload logic: move modules to meta device once all params are saved self._offload_to_meta(saved_params) @@ -144,18 +154,15 @@ def _flush_shard(self): def _offload_to_meta(self, saved_params): """Attempts to move fully saved modules to the 'meta' device to free RAM.""" - # Using a set for faster lookup of all saved parameters - all_saved = {p for meta in self.shard_meta for p in meta["params"]} - for param_full_name in saved_params: module_path = param_full_name.rsplit(".", 1)[0] module = get_module(self.model, module_path) - # Check if all parameters of this module are now in 'all_saved' + # Check if all parameters of this module are now in '_all_saved' if ( module is not None and isinstance(module, torch.nn.Module) - and all(f"{module_path}.{k}" in all_saved for k in module.state_dict().keys()) + and all(f"{module_path}.{k}" in self._all_saved for k in module.state_dict().keys()) ): module.to("meta") @@ -164,11 +171,10 @@ def finalize(self): # 1. Capture remaining weights not yet saved full_sd = self.model.state_dict() tie_word_embeddings = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) - all_saved_names = {p for meta in self.shard_meta for p in meta["params"]} finalize_skipped_meta_tensors = [] for pname, tensor in full_sd.items(): - if pname in all_saved_names: + if pname in self._all_saved: continue if tensor.device.type == "meta": continue diff --git a/auto_round/context/base.py b/auto_round/context/base.py index f427ce664..9f65357f0 100644 --- a/auto_round/context/base.py +++ b/auto_round/context/base.py @@ -12,15 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
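The shard-writer change above keeps one persistent `_all_saved` set, updated incrementally in `_flush_shard` instead of being rebuilt on every lookup, and uses it to skip duplicates in `_add_tensor`. A minimal sketch of that bookkeeping with a hypothetical ToyShardWriter (plain Python payloads instead of tensors and shard files):

class ToyShardWriter:
    def __init__(self):
        self.current_shard = {}  # name -> payload waiting to be flushed
        self._all_saved = set()  # every name already written to some shard

    def add_tensor(self, name, tensor):
        # Duplicate guard, mirroring _add_tensor above.
        if name in self._all_saved or name in self.current_shard:
            return
        self.current_shard[name] = tensor

    def flush_shard(self):
        saved = list(self.current_shard)
        self._all_saved.update(saved)  # incremental update, no O(N^2) rebuild
        self.current_shard.clear()
        return saved

writer = ToyShardWriter()
writer.add_tensor("layer.weight", [1.0])
writer.add_tensor("layer.weight", [1.0])  # ignored as a duplicate
print(writer.flush_shard())  # ['layer.weight']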
+from auto_round.logger import logger -class BaseContext: - __instance = None + +class AutoSkipInitMeta(type): + + def __new__(mcs, name, bases, namespace): + if "__init__" in namespace: + original_init = namespace["__init__"] + + def wrapped_init(self, *args, **kwargs): + if getattr(self, "_singleton_skip_init", False): + return + original_init(self, *args, **kwargs) + self._singleton_skip_init = True + + namespace["__init__"] = wrapped_init + + namespace["_instances"] = {} + return super().__new__(mcs, name, bases, namespace) + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + instance = cls.__new__(cls, *args, **kwargs) + cls._instances[cls] = instance + instance.__init__(*args, **kwargs) + + return cls._instances[cls] + + +class BaseContext(metaclass=AutoSkipInitMeta): + def __init__(self): + logger.info(f"{self.__class__.__name__} context initialized.") @classmethod def get_context(cls): - return cls.__instance + assert cls in cls._instances, f"{cls.__name__} context has not been created yet." + return cls._instances.get(cls) @classmethod def create_context(cls, *args, **kwargs): - cls.__instance = cls(*args, **kwargs) - return cls.__instance + return cls(*args, **kwargs) diff --git a/auto_round/context/compress_context.py b/auto_round/context/compress.py similarity index 95% rename from auto_round/context/compress_context.py rename to auto_round/context/compress.py index 090b83cef..1c2d23689 100644 --- a/auto_round/context/compress_context.py +++ b/auto_round/context/compress.py @@ -35,8 +35,9 @@ def __init__( device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, ): + super().__init__() self.low_cpu_mem_usage = low_cpu_mem_usage - self.low_gpu_mem_usage - low_gpu_mem_usage + self.low_gpu_mem_usage = low_gpu_mem_usage if device_map is None: device_map = 0 diff --git a/auto_round/context/model_context.py b/auto_round/context/model.py similarity index 82% rename from auto_round/context/model_context.py rename to auto_round/context/model.py index ffa9b3a56..a60385331 100644 --- a/auto_round/context/model_context.py +++ b/auto_round/context/model.py @@ -58,7 +58,7 @@ class ModelContext(BaseContext): def __init__( self, - model, + model=None, tokenizer=None, platform="hf", model_dtype=None, @@ -67,35 +67,37 @@ def __init__( need_calib=True, device="cpu", ): + super().__init__() + assert model is not None, "model must be provided for ModelContext" self.model = model self.tokenizer = tokenizer self.device = device if envs.AR_USE_MODELSCOPE: platform = "model_scope" - self._platform = platform - self._model_dtype = model_dtype - self._trust_remote_code = trust_remote_code - self._amp = amp + self.platform = platform + self.model_dtype = model_dtype + self.trust_remote_code = trust_remote_code + self.amp = amp self.need_calib = need_calib def _load_model(self): - if is_mllm_model(self.model, platform=self._platform): + if is_mllm_model(self.model, platform=self.platform): self.is_mllm = True if isinstance(self.model, str): self.model, self.processor, self.tokenizer, self.image_processor = mllm_load_model( - self.model, platform=self._platform, device="cpu", model_dtype=self.model_dtype + self.model, platform=self.platform, device="cpu", model_dtype=self.model_dtype ) elif is_diffusion_model(self.model): self.is_diffusion = True self.pipe, self.model = diffusion_load_model( - self.model, platform=self._platform, device="cpu", model_dtype=self._model_dtype + self.model, platform=self.platform, device="cpu", model_dtype=self.model_dtype ) 
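`AutoSkipInitMeta`/`BaseContext` above make every context class a per-class singleton whose `__init__` body runs only once. A simplified stand-alone sketch of that idea, with a hypothetical SingletonMeta/ToyContext and without the skip-init wrapping:

class SingletonMeta(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Create the instance on first construction; later calls return the cached object.
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]

class ToyContext(metaclass=SingletonMeta):
    def __init__(self, device="cpu"):
        self.device = device

a = ToyContext(device="cuda")
b = ToyContext()  # same object; __init__ is not run again
print(a is b, a.device)  # True cuda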
elif isinstance(self.model, str): config: Optional[AutoConfig] = None try: - config = AutoConfig.from_pretrained(self.model, trust_remote_code=self._trust_remote_code) + config = AutoConfig.from_pretrained(self.model, trust_remote_code=self.trust_remote_code) except (OSError, EnvironmentError) as e: logger.debug( "Failed to load config via AutoConfig.from_pretrained for %s: %s. " @@ -105,7 +107,7 @@ def _load_model(self): ) self.is_model_patched = apply_model_monkey_patches( - model_name=self.model, trust_remote_code=self._trust_remote_code + model_name=self.model, trust_remote_code=self.trust_remote_code ) import transformers @@ -126,36 +128,36 @@ def _load_model(self): self.model, self.tokenizer = llm_load_model( self.model, - platform=self._platform, + platform=self.platform, device="cpu", # always load cpu first - model_dtype=self._model_dtype, - trust_remote_code=self._trust_remote_code, + model_dtype=self.model_dtype, + trust_remote_code=self.trust_remote_code, ) - elif self.tokenizer is None and not self.diffusion and self.need_calib: + elif self.tokenizer is None and not self.is_diffusion and self.need_calib: raise ValueError("A tokenizer must be set for non-str model input") self._model_loaded = True def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" - self._amp_dtype = torch.bfloat16 + self.amp_dtype = torch.bfloat16 if self.model.dtype != torch.float32: - self._amp_dtype = self.model.dtype + self.amp_dtype = self.model.dtype if self.device == "cpu" or "hpu" in self.device: - self._amp_dtype = torch.bfloat16 - if self._amp: + self.amp_dtype = torch.bfloat16 + if self.amp: if self.device == "cpu" and not CpuInfo().bf16: - self._amp = False - self._amp_dtype = torch.float32 + self.amp = False + self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) logger.warning( f"amp is set to FALSE as the current {self.device} device does not support the 'bf16' data type." ) else: - if self.model.dtype != self._amp_dtype: - self.model = self.model.to(self._amp_dtype) + if self.model.dtype != self.amp_dtype: + self.model = self.model.to(self.amp_dtype) else: - self._amp_dtype = torch.float32 + self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) def initialize(self, formats): @@ -176,9 +178,9 @@ def initialize(self, formats): self.is_moe_model = is_moe_model(self.model) self._set_amp_dtype() - if self.act_quantize and self._amp_dtype == torch.float16: + if self.act_quantize and self.amp_dtype == torch.float16: logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") - self._amp_dtype = torch.bfloat16 + self.amp_dtype = torch.bfloat16 if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged self.model = self.model.to(torch.bfloat16) else: @@ -187,7 +189,7 @@ def initialize(self, formats): # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. 
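`_set_amp_dtype` above prefers the model's own dtype, forces bf16 on cpu/hpu, and falls back to fp32 when bf16 is unavailable or amp is off. A condensed sketch of that decision, with a hypothetical `has_cpu_bf16` flag standing in for `CpuInfo().bf16`:

import torch

def pick_amp_dtype(model_dtype, device, amp=True, has_cpu_bf16=True):
    # Prefer the model dtype, force bf16 on cpu/hpu, fall back to fp32 if unsupported.
    amp_dtype = model_dtype if model_dtype != torch.float32 else torch.bfloat16
    if device == "cpu" or "hpu" in device:
        amp_dtype = torch.bfloat16
    if not amp or (device == "cpu" and not has_cpu_bf16):
        amp_dtype = torch.float32
    return amp_dtype

print(pick_amp_dtype(torch.float16, "cuda"))  # torch.float16
print(pick_amp_dtype(torch.float32, "cpu", has_cpu_bf16=False))  # torch.float32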
self.model = update_module( - self.model, formats=formats, trust_remote_code=self._trust_remote_code, cleanup_original=False + self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False ) # Temporary names must be assigned after handle_moe_model; @@ -195,8 +197,8 @@ def initialize(self, formats): for n, m in self.model.named_modules(): m.global_name = n - if self._amp and self.model.dtype != self._amp_dtype: - self.model = self.model.to(self._amp_dtype) + if self.amp and self.model.dtype != self.amp_dtype: + self.model = self.model.to(self.amp_dtype) self._init_model = True diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 9b70a5a86..8ebe5ab0a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -44,7 +44,9 @@ class QuantizationScheme: @classmethod def from_dict(cls, config: dict): - return cls(**config) + field_names = {f.name for f in fields(cls)} + filtered_config = {k: v for k, v in config.items() if k in field_names} + return cls(**filtered_config) @classmethod def get_attributes(cls: "QuantizationScheme") -> list[str]: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index bb7bb03de..07cfe7795 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -976,6 +976,7 @@ def check_to_quantized(config): bool: True if the configuration is valid for quantization (bits <= 8), False otherwise. """ + from auto_round.schemes import QuantizationScheme if isinstance(config, (dict, QuantizationScheme)): bits = config.get("bits", None) From e265b8fea22d4a023d93afbebdee98bd64618be3 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 17 Mar 2026 17:02:38 +0800 Subject: [PATCH 03/90] update Signed-off-by: n1ck-guo --- .../algorithms/quantization/__init__.py | 2 + .../quantization/auto_round/quantizer.py | 80 +- auto_round/algorithms/quantization/base.py | 123 +- .../algorithms/quantization/rtn/config.py | 74 +- .../algorithms/quantization/rtn/quantizer.py | 390 +++++ auto_round/algorithms/quantization/rtn/rtn.py | 13 - auto_round/compressors/base.py | 31 +- auto_round/compressors/utils.py | 10 +- auto_round/compressors_new/__init__.py | 26 + .../architecture_visualization.py | 255 ++++ auto_round/compressors_new/base.py | 1351 ++--------------- auto_round/compressors_new/calib.py | 1235 +++++++++++++++ auto_round/compressors_new/diffusion_mixin.py | 157 ++ .../docs/compressors_new_architecture.md | 291 ++++ .../docs/compressors_new_architecture_CN.md | 787 ++++++++++ auto_round/compressors_new/entry.py | 354 +++++ auto_round/compressors_new/mllm_mixin.py | 148 ++ auto_round/compressors_new/utils.py | 46 + auto_round/compressors_new/zero_shot.py | 317 ++++ auto_round/context/compress.py | 10 +- auto_round/context/model.py | 6 +- 21 files changed, 4413 insertions(+), 1293 deletions(-) create mode 100644 auto_round/algorithms/quantization/rtn/quantizer.py delete mode 100644 auto_round/algorithms/quantization/rtn/rtn.py create mode 100644 auto_round/compressors_new/architecture_visualization.py create mode 100644 auto_round/compressors_new/calib.py create mode 100644 auto_round/compressors_new/diffusion_mixin.py create mode 100644 auto_round/compressors_new/docs/compressors_new_architecture.md create mode 100644 auto_round/compressors_new/docs/compressors_new_architecture_CN.md create mode 100644 auto_round/compressors_new/entry.py create mode 100644 auto_round/compressors_new/mllm_mixin.py create mode 100644 auto_round/compressors_new/zero_shot.py diff --git a/auto_round/algorithms/quantization/__init__.py 
b/auto_round/algorithms/quantization/__init__.py index a4b2fca7f..00de9d3ac 100644 --- a/auto_round/algorithms/quantization/__init__.py +++ b/auto_round/algorithms/quantization/__init__.py @@ -17,3 +17,5 @@ from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer from auto_round.algorithms.quantization.auto_round.adam import ARAdamQuantizer +from auto_round.algorithms.quantization.rtn.config import RTNConfig +from auto_round.algorithms.quantization.rtn.quantizer import RTNQuantizer, OptimizedRTNQuantizer diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 423bb4f29..d64e5087d 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -30,16 +30,16 @@ check_skippable_keywords, collect_best_params, get_shared_keys, + immediate_pack, infer_bits_by_data_type, init_cache, - is_nv_fp, reset_params, - set_layer_config, ) from auto_round.logger import logger from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ from auto_round.sign_sgd import SignSGD from auto_round.utils import ( + check_to_quantized, clear_memory, compile_func, convert_module_to_hp_if_necessary, @@ -97,7 +97,7 @@ def post_init(self): logger.warning_once("using algorithm extension for quantization.") from auto_round.alg_ext import wrapper_autoround - wrapper_autoround(self.quantizer) + wrapper_autoround(self) except (ImportError, ModuleNotFoundError): logger.error("algorithm extension import error, fallback to default mode") @@ -334,8 +334,26 @@ def quantize_block( input_ids: Union[list[torch.Tensor], dict], input_others: dict, q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", auto_offload=True, + **kwargs, + ): + self._quantize_block(block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs) + if hasattr(block, "config"): + del block.block + if self.compress_context.is_immediate_saving: + for n, tmp_m in block.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + immediate_pack(tmp_m.global_name, self.quantizer.layer_config) + + def _quantize_block( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + auto_offload=True, + **kwargs, ): """Quantize the weights of a given block of the model. 
@@ -349,6 +367,7 @@ def quantize_block( Returns: Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) """ + device = self.compress_context.device materialize_model_(block) convert_module_to_hp_if_necessary(block, self.model_context.amp_dtype, device) @@ -388,8 +407,6 @@ def quantize_block( input_ids, input_others, self.batch_size * self.infer_bs_coeff, - device, - self.compress_context.cache_device, ) for handle in hook_handles: @@ -400,8 +417,6 @@ def quantize_block( input_ids, input_others, self.batch_size * self.infer_bs_coeff, - device, - self.compress_context.cache_device, ) hook_handles = self._register_act_max_hook(block) if hook_handles: @@ -410,8 +425,6 @@ def quantize_block( q_input if q_input is not None else input_ids, input_others, self.batch_size * self.infer_bs_coeff, - device, - self.compress_context.cache_device, save_output=False, ) @@ -590,8 +603,6 @@ def quantize_block( input_ids, input_others, self.batch_size * self.infer_bs_coeff, - device, - cache_device=self.compress_context.cache_device, ) if len(self.compress_context.device_list) > 1 and auto_offload: @@ -615,7 +626,9 @@ def quantize_block( return None, output - def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + def quantize_layer( + self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu", **kwargs + ): """Quantize a specific layer of the model using the provided inputs. Args: @@ -638,12 +651,12 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. if q_inputs is not None: q_inputs[i] = q_inputs[i].to(layer.weight.dtype) - if self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, + if self.config.is_act_quantize and check_need_act_calibration( + self.config.act_dynamic, + self.config.act_data_type, + self.config.act_bits, + self.config.static_kv_dtype, + self.config.static_attention_dtype, ): tmp_inputs = q_inputs if q_inputs is not None else inputs hook_handles = self._register_act_max_hook(layer) @@ -656,7 +669,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. wrapper_linear = WrapperLinear( layer, enable_minmax_tuning=self.enable_minmax_tuning, - enable_torch_compile=self.enable_torch_compile, + enable_torch_compile=self.compress_context.enable_torch_compile, device=device, ).to(device) round_params = [] @@ -795,8 +808,6 @@ def _get_block_outputs( input_ids: torch.Tensor | list[torch.Tensor], input_others: torch.Tensor | dict, bs: int, - device: Union[str, torch.device], - cache_device: Union[str, torch.device], save_output: bool = True, ): """Compute the output of a given block of the model for a given input. @@ -814,9 +825,21 @@ def _get_block_outputs( The output tensor of the block. """ - self.block_forward = ( - compile_func(block_forward, self.device) if self.compress_context.enable_torch_compile else block_forward - ) + if ( + (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks + or self.enable_alg_ext # Use imatrix + # or not self.disable_opt_rtn # Use imatrix + ): + self.block_forward = block_forward + else: + # TODO FIXME + # This function could not be compiled, causing a large accuracy drop when `enable_alg_ext` is used. + # To avoid issues, remove it in all scenarios except WOQ. 
+ self.block_forward = ( + compile_func(block_forward, self.compress_context.device) + if self.compress_context.enable_torch_compile + else block_forward + ) output = [] nsamples = len(input_ids) @@ -832,7 +855,12 @@ def _get_block_outputs( share_cache_keys=self.model_context.shared_cache_keys, ) tmp_output = self.block_forward( - block, tmp_input_ids, tmp_input_others, self.model_context.amp, self.model_context.amp_dtype, device + block, + tmp_input_ids, + tmp_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + self.compress_context.device, ).to(self.compress_context.cache_device) if save_output: if self.batch_size == 1: diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 0d293afa4..affd244b1 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -14,6 +14,7 @@ import copy import importlib import sys +import traceback from dataclasses import fields import torch @@ -22,6 +23,7 @@ from auto_round.compressors_new.utils import ( IndexSampler, _get_quantized_layer_names_outside_blocks, + _get_save_folder_name, block_forward, check_need_act_calibration, check_skippable_keywords, @@ -29,11 +31,11 @@ get_shared_keys, infer_bits_by_data_type, init_cache, - reset_params, set_layer_config, ) from auto_round.context.compress import CompressContext from auto_round.context.model import ModelContext +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.logger import logger from auto_round.schemes import ( @@ -48,6 +50,7 @@ INNER_SUPPORTED_LAYER_TYPES, SUPPORTED_LAYER_TYPES, check_to_quantized, + clear_memory, convert_dtype_str2torch, find_matching_blocks, get_block_names, @@ -56,15 +59,25 @@ class BaseQuantizers: - def __init__(self, config: QuantizationConfig): - self.scheme = config.scheme + self.config = config self.layer_config = config.layer_config - self.quant_lm_head = config.quant_lm_head + self.scheme = config.scheme + self.bits = config.bits + self.group_size = config.group_size + self.sym = config.sym + self.data_type = config.data_type + self.act_bits = config.act_bits + self.act_group_size = config.act_group_size + self.act_sym = config.act_sym + self.act_data_type = config.act_data_type + self.act_dynamic = config.act_dynamic + self.super_bits = config.super_bits + self.super_group_size = config.super_group_size self.scale_dtype = config.scale_dtype - self.to_quant_block_names = config.to_quant_block_names self.ignore_layers = config.ignore_layers - self.config = config + self.quant_lm_head = config.quant_lm_head + self.to_quant_block_names = config.to_quant_block_names @classmethod def from_config(cls, config: QuantizationConfig): @@ -75,6 +88,10 @@ def from_config(cls, config: QuantizationConfig): alg_cls = getattr(module, config._alg_cls) return alg_cls(config) + @property + def formats(self): + return getattr(self.compress_context, "formats", None) + def post_init(self): # should be set after loading model and set layer_config, cause some special scheme need these. 
# Preserve the original, unparsed scheme for later use in auto scheme generation
@@ -84,6 +101,11 @@ def post_init(self):
         self.model_context = ModelContext()
         self.compress_context = CompressContext()
 
+        # used in the shard writer, refactor later
+        self._get_save_folder_name = _get_save_folder_name
+
+        self.model = self.model_context.model
+
         scheme_fields = {f.name for f in fields(QuantizationScheme)}
         user_scheme_overrides = {}
         for k in scheme_fields:
@@ -95,6 +117,8 @@ def post_init(self):
             # Bind attributes to self.config for easy instance-level access
             for key, value in final_attrs.items():
                 setattr(self.config, key, value)
+                if hasattr(self, key):
+                    setattr(self, key, value)
 
         self.config.check_config()
         self.orig_scheme = copy.deepcopy(self.scheme)
@@ -198,8 +222,8 @@ def _gen_auto_scheme(self) -> dict[str, dict]:
         return layer_config
 
     def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True):
-
-        is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf()
+        # runs before get_formats(), so compress_context.formats is still a plain string here
+        is_gguf_format = (f := getattr(self.compress_context, "formats", None)) is not None and "gguf" in f
         if not is_gguf_format:
             predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model)
             if predefined_ignore_layers:
@@ -300,3 +324,86 @@ def get_act_max_hook(module, input, output):
             hook_handles.append(hook)
             continue
         return hook_handles
+
+    @torch.inference_mode()
+    def _quantize_embedding_layer(self):
+        """Quantizes embedding layers in the model according to the configuration.
+
+        This method iterates through all modules in the model, identifies embedding
+        layers specified in `self.layer_config`, and applies the appropriate quantization
+        function based on bit precision, grouping strategy, and dtype.
+
+        Returns:
+            bool: True if the quantization process completes without critical errors.
+ """ + is_quantized = False + for name, module in self.model_context.model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: + continue + + config = self.layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = self.scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=self.compress_context.device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + self.layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=self.compress_context.device_list) + + return is_quantized diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py index 01d7fe939..2eae2d507 100644 --- a/auto_round/algorithms/quantization/rtn/config.py +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -1,9 +1,71 @@ -# # Copyright (C) 2026 Intel Corporation -# # SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-from auto_round.algorithms.alg_config import AlgConfig +from auto_round.algorithms.quantization.config import QuantizationConfig +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.logger import logger -class RTNConfig(AlgConfig): - def __init__(self): - super().__init__() +class RTNConfig(QuantizationConfig): + _alg_cls = "RTNQuantizer" + + def __init__( + self, + scheme="W4A16", + layer_config=None, + *, + disable_opt_rtn: bool = None, + # for opt-rtn + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + **kwargs, + ): + super().__init__(scheme=scheme, layer_config=layer_config, **kwargs) + + self.seqlen = seqlen + self.nsamples = nsamples + self.batch_size = batch_size + + # Some helpers + self.infer_bs_coeff = 1 + self.batch_dim = None + + # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it. + # To avoid None issue, we keep a copy though it's a little ugly + enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) + if enable_opt_rtn and disable_opt_rtn: + raise ValueError("`enable_opt_rtn` and `disable_opt_rtn` are mutually exclusive; " "only one can be set.") + if enable_opt_rtn: + disable_opt_rtn = False + self.orig_disable_opt_rtn = disable_opt_rtn + + if disable_opt_rtn is None: + if isinstance(scheme, str) and scheme in ["W8A16", "W8A8"]: + logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") + disable_opt_rtn = True + if self.bits and self.bits >= 8 and self.act_bits and self.act_bits >= 8 and self.data_type == "int": + logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") + disable_opt_rtn = True + if disable_opt_rtn is None: + logger.info( + "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." + ) + disable_opt_rtn = False + self.disable_opt_rtn = disable_opt_rtn + if not self.disable_opt_rtn: + self._alg_cls = "OptimizedRTNQuantizer" + + if not self.disable_opt_rtn and f"rtn_{self.data_type}" in QUANT_FUNC_WITH_DTYPE: + self.data_type = f"rtn_{self.data_type}" diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py new file mode 100644 index 000000000..52a6d83a5 --- /dev/null +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -0,0 +1,390 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
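+
+# `RTNQuantizer` applies plain round-to-nearest quantization layer by layer and needs no
+# calibration data. `OptimizedRTNQuantizer` additionally runs the block forward pass on
+# calibration inputs to collect activation statistics (act_max / imatrix) before
+# quantizing each layer.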
+import copy
+import traceback
+from collections import defaultdict
+from typing import Any, Callable, Optional, Union
+
+import accelerate
+import torch
+
+from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer
+from auto_round.algorithms.quantization.base import BaseQuantizers
+from auto_round.algorithms.quantization.rtn.config import RTNConfig
+from auto_round.compressors.shard_writer import shard_writer
+from auto_round.compressors_new.utils import (
+    IndexSampler,
+    block_forward,
+    check_need_act_calibration,
+    check_skippable_keywords,
+    collect_best_params,
+    get_shared_keys,
+    immediate_pack,
+    infer_bits_by_data_type,
+    init_cache,
+    reset_params,
+    set_layer_config,
+)
+from auto_round.data_type.utils import update_block_global_scale_if_needed
+from auto_round.logger import logger
+from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_
+from auto_round.utils import (
+    check_to_quantized,
+    clear_memory,
+    convert_module_to_hp_if_necessary,
+    get_lm_head_name,
+    get_module,
+    htcore,
+    is_auto_device_mapping,
+    is_hpex_available,
+    memory_monitor,
+    mv_module_from_gpu,
+    set_amax_for_all_moe_layers,
+    set_module,
+)
+from auto_round.utils.device import (
+    clear_memory_if_reached_threshold,
+    get_major_device,
+    parse_available_devices,
+    set_auto_device_map_for_block_with_tuning,
+    set_non_auto_device_map,
+)
+from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block
+
+
+class RTNQuantizer(BaseQuantizers):
+    def __init__(self, config: RTNConfig):
+        BaseQuantizers.__init__(self, config)
+
+    def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs):
+        tied_weights_keys = getattr(self.model, "_tied_weights_keys", [])
+        if tied_weights_keys is None:
+            tied_weights_keys = []
+        if isinstance(tied_weights_keys, dict):
+            tied_weights_values = list(tied_weights_keys.values())
+        else:
+            tied_weights_values = list(tied_weights_keys)
+        tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values]  # rm weight/bias
+        # Ideally we would also detect is_separate_lm_head here, but we skip that check for simplicity
+        if hasattr(self.compress_context, "formats") and self.compress_context.formats[0].is_gguf():
+            lm_head_name = get_lm_head_name(self.model)
+            if lm_head_name is not None:
+                tied_weights_layers.append(lm_head_name)
+
+        materialize_model_(block)
+        for name, m in block.named_modules():
+            if hasattr(m, "global_name") and check_to_quantized(m):
+                self.quantize_layer(m.global_name)
+            elif (
+                not any(m.children())
+                and len(m.state_dict()) > 0
+                and m.global_name not in tied_weights_layers
+                and self.compress_context.is_immediate_saving
+            ):
+                set_module(self.model, m.global_name, copy.deepcopy(m))
+                if self.compress_context.is_immediate_saving:
+                    shard_writer(self, name=m.global_name)
+                    copied_m = get_module(self.model, m.global_name)
+                    copied_m.to("meta")
+                m.to("meta")
+        # Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage.
+        # This mirrors _quantize_via_rtn_blockwise's post-block cleanup.
+        if not self.compress_context.is_immediate_saving:
+            mv_module_from_gpu(block)
+        else:
+            # Save once at block scope to capture tensors that are not saved
+            # in per-layer branch (e.g., custom module-level params/buffers).
+ shard_writer(self, name=block_name) + block.to("meta") + + def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + + m = get_module(self.model, name) + if dtype is not None: + m = m.to(dtype) + + m = convert_module_to_hp_if_necessary(m, self.model_context.amp_dtype, self.compress_context.device) + set_module(self.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compress_context.device + # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic + if self.compress_context.is_immediate_packing and self.compress_context.formats[0].is_gguf(): + m = m.to(tuning_device) + m.scale = None + m.zp = None + else: + try: + disable_opt_rtn = False + if ( + self.config.orig_disable_opt_rtn is None + and self.model_context.is_moe_model + and "expert" in m.global_name + and "shared_expert" not in m.global_name + and self.config.super_bits is None # GGUF still uses the optimized RTN for MoE layers + ): + disable_opt_rtn = True + logger.warning_once( + "MoE layer detected: optimized RTN is disabled for efficiency. " + "Use `--enable_opt_rtn` to force-enable it for MoE layers." + ) + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compress_context.enable_torch_compile, + disable_opt_rtn=disable_opt_rtn, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compress_context.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + self._immediate_pack_and_save_module(name) + + def _immediate_pack_and_save_module(self, module_name): + to_cpu = self.compress_context.low_gpu_mem_usage + module = get_module(self.model, module_name) + if self.compress_context.is_immediate_packing: # For gguf, packing conducts on block level + immediate_pack(module_name, self.layer_config) + if to_cpu: + module = module.to("cpu") + packed_module = get_module(self.model, module_name) + set_module(self.model, module_name, packed_module.to("cpu")) + else: + if to_cpu: + module = module.to("cpu") + set_module(self.model, module_name, module) + if self.compress_context.is_immediate_saving: + module = get_module(self.model, module_name) + module.to("cpu") + shard_writer(self, module, module_name, False) + # Free RAM immediately: the data is now in the shard-writer buffer + # (and will be flushed to disk). Keeping it also in the model tree + # causes linear RAM growth for large models. 
+ module.to("meta") + + +class OptimizedRTNQuantizer(RTNQuantizer): + def __init__(self, config: RTNConfig): + BaseQuantizers.__init__(self, config) + self.batch_size = config.batch_size + self.seqlen = config.seqlen + self.nsamples = config.nsamples + self.batch_dim = config.batch_dim + self.data_type = config.data_type + self.group_size = config.group_size + self.infer_bs_coeff = config.infer_bs_coeff + + self.enable_alg_ext = True + + def quantize_block( + self, block: torch.nn.Module, input_ids: Union[list[torch.Tensor], dict], input_others: dict, **kwargs + ): + + materialize_model_(block) + block.to("cpu") + + block = convert_module_to_hp_if_necessary( + block, dtype=self.model_context.amp_dtype, device=self.compress_context.device + ) + update_block_global_scale_if_needed(block, self.data_type, self.group_size) + self._register_act_max_hook(block) + if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: + set_auto_device_map_for_block_with_tuning( + block, + self.compress_context.device_map, + input_ids, + self.compress_context.low_gpu_mem_usage, + self.batch_size, + self.compress_context.device, + ) + # Dispatch model if needed + if len(self.compress_context.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + else: + block = block.to(self.compress_context.device) + input_ids = self._get_block_outputs( + block, + input_ids, + input_others, + self.batch_size * self.infer_bs_coeff, + ) + + if len(self.compress_context.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + if self.config.is_act_nv_fp or self.config.is_static_afp8: + # enable moe experts act_max automatic generation for Linear + set_amax_for_all_moe_layers(block, attr_name="act_max") + # Normalize imatrix and quantize layers + if self.compress_context.low_gpu_mem_usage: + block.to("cpu") + clear_memory(device_list=self.compress_context.device_list) + + for name, m in block.named_modules(): + # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu + # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 + if hasattr(m, "imatrix"): + m.imatrix /= m.imatrix_cnt + if hasattr(m, "global_name") and check_to_quantized(m): + self.quantize_layer(m.global_name) + + mv_module_from_gpu(block) + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids: torch.Tensor | list[torch.Tensor], + input_others: torch.Tensor | dict, + bs: int, + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + batch_dim: The batch dimension of the output tensor. + + Returns: + The output tensor of the block. 
+ """ + + self.block_forward = block_forward + + output = [] + nsamples = len(input_ids) + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + self.seqlen, + self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + tmp_output = self.block_forward( + block, + tmp_input_ids, + tmp_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + self.compress_context.device, + ).to(self.compress_context.cache_device) + if save_output: + if self.batch_size == 1: + output.append(tmp_output) + else: + output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) + if self.compress_context.low_gpu_mem_usage: + clear_memory(device_list=self.compress_context.device_list) + + return output + + @classmethod + @torch.no_grad() + def _sampling_inputs( + cls, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + indices: list[int] | torch.Tensor, + seqlen: int, + batch_dim: int = 0, + share_cache_keys: tuple = (), + ): + """Samples inputs based on the given indices and sequence length. + + Args: + input_ids: The list of input tensor containing input_ids. + input_others: A dictionary containing other input data. + indices: The indices to sample from the input. + seqlen: The sequence length. + + Returns: + current_input_ids: The sampled input IDs. + current_input_others: The sampled other input data. + """ + if isinstance(input_ids, list): + current_input_ids = [input_ids[i] for i in indices] + current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + elif isinstance(input_ids, dict): + current_input_ids = defaultdict(list) + for k in input_ids.keys(): + current_input_ids[k].extend([input_ids[k][i] for i in indices]) + current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) + + current_input_others = {"positional_inputs": input_others["positional_inputs"]} + for key in input_others.keys(): + if "positional_inputs" in key: + continue + if (key not in share_cache_keys or len(indices) == 1) and not isinstance( + input_others[key], (str, bool, type(None)) + ): + current_input_others[key] = None + if input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = input_others[key] + + return current_input_ids, current_input_others diff --git a/auto_round/algorithms/quantization/rtn/rtn.py b/auto_round/algorithms/quantization/rtn/rtn.py deleted file mode 100644 index 14a492441..000000000 --- a/auto_round/algorithms/quantization/rtn/rtn.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2026 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 9d7184e40..20514fba1 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -44,6 +44,8 @@ get_shared_keys, infer_bits_by_data_type, init_cache, + is_block_wfp8, + is_dynamic_afp8, is_dynamic_wint8aint8, is_mx_fp, is_nv_fp, @@ -78,6 +80,7 @@ check_to_quantized, clear_memory, compile_func, + compress_layer_names, convert_dtype_str2torch, convert_module_to_hp_if_necessary, detect_device, @@ -457,7 +460,7 @@ def __init__( self.bits >= 8 and self.act_bits >= 8 and self.iters == 0 - and self.data_type == "int" + and self.data_type in ["int", "fp"] and disable_opt_rtn is None ): logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") @@ -468,6 +471,9 @@ def __init__( ) disable_opt_rtn = False + if self.iters > 0 and is_block_wfp8(self): + logger.warning("RTN is recommended since it shows even better accuracy for block-wise fp8 quantization.") + # Important Note! This is not very robust, do NOT rely on it to do high risky thing self.is_moe_model = is_moe_model(self.model) @@ -840,10 +846,14 @@ def _check_configs(self) -> None: raise ValueError("`bits` must be positive") if self.act_bits <= 0: raise ValueError("`act_bits` must be positive") - if not (self.group_size == -1 or self.group_size >= 0): - raise ValueError("`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") - if not (self.act_group_size == -1 or self.act_group_size >= 0): + if not isinstance(self.group_size, tuple) and not (self.group_size == -1 or self.group_size >= 0): + raise ValueError( + "`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer or a tuple of length 2" + ) + if isinstance(self.act_group_size, tuple) or not (self.act_group_size == -1 or self.act_group_size >= 0): raise ValueError("`act_group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") + if isinstance(self.group_size, tuple) and len(self.group_size) != 2: + raise ValueError("`group_size` must be a tuple of length 2") if self.batch_size <= 0: raise ValueError("`batch_size` must be positive") if self.iters < 0: @@ -861,6 +871,7 @@ def _check_configs(self) -> None: and not is_mx_fp(self.act_data_type) and not is_dynamic_wint8aint8(self) and not is_static_wfp8afp8(self.act_data_type) + and not is_dynamic_afp8(self) ): logger.warning( "activation quantization is an experimental feature with limited support and a complex API. 
" @@ -873,6 +884,12 @@ def _check_configs(self) -> None: if is_nv_fp(self.data_type) and (self.group_size != 16): logger.warning("dtype nv_fp should only support group_size of 16 in real deployment") + if isinstance(self.group_size, tuple): + if not is_block_wfp8(self): + raise NotImplementedError("only support block-wise quantization for fp8 weight quantization.") + if not is_dynamic_afp8(self): + raise NotImplementedError("only support dynamic fp8 activation for fp8 weight quantization.") + if self.nsamples < self.gradient_accumulate_steps * self.batch_size: if self.batch_size > self.nsamples: if self.iters > 0: # GGUF should log this warning, but we don't know the format here @@ -1613,8 +1630,9 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf() if not is_gguf_format: predefined_ignore_layers = get_predefined_ignore_layers(self.model) + compressed_predefined_ignore_layers = compress_layer_names(predefined_ignore_layers) if predefined_ignore_layers: - logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + logger.info(f"Using predefined ignore_layers: {compressed_predefined_ignore_layers}") tmp_str = ",".join(predefined_ignore_layers) if self.ignore_layers == "": self.ignore_layers = tmp_str @@ -1868,7 +1886,8 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool: f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" ) if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" + compressed_unquantized_layers = compress_layer_names(unquantized_layers) + summary_info += f", {compressed_unquantized_layers} have not been quantized" logger.info(summary_info) self.quantized = True diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 1aef5bd7c..1d5c0a8ce 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -19,7 +19,7 @@ import sys from dataclasses import asdict, fields from enum import Enum -from typing import Callable, Union +from typing import TYPE_CHECKING, Callable, Union import torch import transformers @@ -27,9 +27,11 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme from auto_round.utils import check_to_quantized +if TYPE_CHECKING: + from auto_round.schemes import QuantizationScheme + class BackendDataType(str, Enum): STANDARD_FP = "fp" @@ -286,7 +288,7 @@ def set_layer_config( Returns (final_layer_config, has_quant_layer_outside_block) """ - from auto_round.schemes import get_gguf_scheme + from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module, is_separate_lm_head # ---- helpers ------------------------------------------------- @@ -303,6 +305,7 @@ def dispatch_layer_config(layer_config: dict[str, dict]) -> None: def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: """Convert config entry into dict and validate keys.""" + if isinstance(item, str): config = asdict(preset_name_to_scheme(item.upper())) elif isinstance(item, QuantizationScheme): @@ -609,6 +612,7 @@ def 
get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model import gguf # pylint: disable=E0401 + from auto_round.schemes import get_gguf_scheme from auto_round.utils.common import MM_KEYS, LazyImport from auto_round.utils.model import get_lm_head_name, get_module diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py index 14a492441..91b9ffda8 100644 --- a/auto_round/compressors_new/__init__.py +++ b/auto_round/compressors_new/__init__.py @@ -11,3 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# Lazy imports to avoid circular dependencies +# Users should import from specific modules instead of this __init__.py + +__all__ = ["Compressor", "CalibCompessor", "ImatrixCompressor", "ZeroShotCompressor", "AutoRound"] + + +def __getattr__(name): + """Lazy import to avoid circular dependencies.""" + if name == "Compressor" or name == "AutoRound": + from auto_round.compressors_new.entry import Compressor, AutoRound + + if name == "Compressor": + return Compressor + return AutoRound + elif name == "CalibCompessor" or name == "ImatrixCompressor": + from auto_round.compressors_new.calib import CalibCompessor, ImatrixCompressor + + if name == "CalibCompessor": + return CalibCompessor + return ImatrixCompressor + elif name == "ZeroShotCompressor": + from auto_round.compressors_new.zero_shot import ZeroShotCompressor + + return ZeroShotCompressor + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") diff --git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py new file mode 100644 index 000000000..fb31f4827 --- /dev/null +++ b/auto_round/compressors_new/architecture_visualization.py @@ -0,0 +1,255 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +""" +New Architecture Visualization - Mixin Pattern Combination Table + +Demonstrates all possible combinations of model types and compression algorithms. 
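+Combined classes are created dynamically through multiple inheritance, e.g.
+`class MLLMCalibCompressor(MLLMMixin, CalibCompessor)` pairs MLLM handling with
+calibration-based compression.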
+""" + + +def print_architecture_table(): + """Print architecture combination table""" + + print("\n" + "=" * 100) + print("Compressor New Architecture - Mixin Pattern Combination Table") + print("=" * 100 + "\n") + + # Table header + print(f"{'Model Type':<15} {'Config Type':<20} {'Algorithm':<20} {'Actual Created Class':<35}") + print("-" * 100) + + # LLM combinations + print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'CalibCompessor':<35}") + print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'ImatrixCompressor':<35}") + print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'ZeroShotCompressor':<35}") + + print() + + # MLLM combinations + print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'MLLMCalibCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompessor':<35}") + print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'MLLMImatrixCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ImatrixCompressor':<35}") + print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'MLLMZeroShotCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ZeroShotCompressor':<35}") + + print() + + # Diffusion combinations + print(f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'DiffusionCalibCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompessor':<35}") + print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'DiffusionImatrixCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ImatrixCompressor':<35}") + print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'DiffusionZeroShotCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ZeroShotCompressor':<35}") + + print("\n" + "=" * 100 + "\n") + + +def print_mixin_explanation(): + """Print Mixin pattern explanation""" + + print("=" * 100) + print("Mixin Pattern Explanation") + print("=" * 100 + "\n") + + print("✨ Core Components:") + print("-" * 100) + print(" 1. MLLMMixin - MLLM features (processor, template, etc.)") + print(" 2. DiffusionMixin - Diffusion features (guidance_scale, pipeline, etc.)") + print(" 3. CalibCompessor - Calibration-based compression algorithm (AutoRound)") + print(" 4. ImatrixCompressor - RTN + importance matrix") + print(" 5. 
ZeroShotCompressor - Zero-shot RTN") + + print("\n🎯 Combination Approach:") + print("-" * 100) + print(" Dynamically create combined classes through multiple inheritance:") + print(" class MLLMCalibCompressor(MLLMMixin, CalibCompessor):") + print(" pass") + print("\n MLLMMixin provides MLLM features, CalibCompessor provides compression algorithm") + + print("\n💡 Advantages:") + print("-" * 100) + print(" ✓ Flexible Combination: Any model feature can be combined with any compression algorithm") + print(" ✓ Code Reuse: Mixin code is written once and can be reused multiple times") + print(" ✓ Clear Separation: Model features and compression algorithms are completely independent") + print(" ✓ Easy Extension: Adding new model types or new algorithms is straightforward") + + print("\n" + "=" * 100 + "\n") + + +def print_usage_examples(): + """Print usage examples""" + + print("=" * 100) + print("Usage Examples") + print("=" * 100 + "\n") + + print("Example 1: MLLM + AutoRound") + print("-" * 100) + print( + """ +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200) +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + processor=processor, + template="qwen2_vl", +) +# Actually creates: MLLMCalibCompressor (MLLMMixin + CalibCompessor) + """ + ) + + print("\nExample 2: MLLM + RTN + imatrix") + print("-" * 100) + print( + """ +from auto_round.algorithms.quantization.rtn.config import RTNConfig + +config = RTNConfig(scheme="W4A16") +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + format="gguf_k", # Triggers imatrix + processor=processor, +) +# Actually creates: MLLMImatrixCompressor (MLLMMixin + ImatrixCompressor) + """ + ) + + print("\nExample 3: Diffusion + AutoRound") + print("-" * 100) + print( + """ +config = AutoRoundConfig(scheme="W4A16", iters=200) +compressor = Compressor( + config=config, + model="/models/stable-diffusion-2-1", + guidance_scale=7.5, +) +# Actually creates: DiffusionCalibCompressor (DiffusionMixin + CalibCompessor) + """ + ) + + print("\n" + "=" * 100 + "\n") + + +def print_mro_example(): + """Print MRO (Method Resolution Order) example""" + + print("=" * 100) + print("Method Resolution Order (MRO) Example") + print("=" * 100 + "\n") + + print("For MLLMCalibCompressor(MLLMMixin, CalibCompessor):") + print("-" * 100) + print( + """ +MLLMCalibCompressor + └─> MLLMMixin + └─> CalibCompessor + └─> BaseCompressor + └─> object + +Execution order when calling __init__(): + 1. MLLMCalibCompressor.__init__() (if defined) + 2. MLLMMixin.__init__() + - Save MLLM-specific parameters (processor, template, etc.) + - Call super().__init__() → enters CalibCompessor + 3. CalibCompessor.__init__() + - Save calibration-related parameters (dataset, iters, etc.) + - Call super().__init__() → enters BaseCompressor + 4. BaseCompressor.__init__() + - Base class initialization + +Thus, MLLMCalibCompressor has both: + ✓ MLLM features (from MLLMMixin) + ✓ Calibration compression functionality (from CalibCompessor) + """ + ) + + print("=" * 100 + "\n") + + +def print_decision_tree(): + """Print decision tree""" + + print("=" * 100) + print("Compressor Creation Decision Tree") + print("=" * 100 + "\n") + + print( + """ +Compressor.__new__(config, model, ...) 
+│ +├─ Step 1: Detect model type +│ model_type = detect_model_type(model) +│ ├─ is_diffusion_model() → "diffusion" +│ ├─ is_mllm_model() → "mllm" +│ └─ else → "llm" +│ +├─ Step 2: Determine config type +│ │ +│ ├─ AutoRoundConfig (requires calibration) +│ │ ├─ model_type == "mllm" +│ │ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompessor) +│ │ │ return MLLMCalibCompressor(...) +│ │ │ +│ │ ├─ model_type == "diffusion" +│ │ │ └─> class DiffusionCalibCompressor(DiffusionMixin, CalibCompessor) +│ │ │ return DiffusionCalibCompressor(...) +│ │ │ +│ │ └─ model_type == "llm" +│ │ └─> return CalibCompessor(...) +│ │ +│ └─ RTNConfig (zero-shot or imatrix) +│ │ +│ ├─ enable_imatrix == True +│ │ ├─ model_type == "mllm" +│ │ │ └─> class MLLMImatrixCompressor(MLLMMixin, ImatrixCompressor) +│ │ │ return MLLMImatrixCompressor(...) +│ │ │ +│ │ ├─ model_type == "diffusion" +│ │ │ └─> class DiffusionImatrixCompressor(DiffusionMixin, ImatrixCompressor) +│ │ │ return DiffusionImatrixCompressor(...) +│ │ │ +│ │ └─ model_type == "llm" +│ │ └─> return ImatrixCompressor(...) +│ │ +│ └─ enable_imatrix == False +│ ├─ model_type == "mllm" +│ │ └─> class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) +│ │ return MLLMZeroShotCompressor(...) +│ │ +│ ├─ model_type == "diffusion" +│ │ └─> class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) +│ │ return DiffusionZeroShotCompressor(...) +│ │ +│ └─ model_type == "llm" +│ └─> return ZeroShotCompressor(...) + """ + ) + + print("=" * 100 + "\n") + + +def main(): + """Run all visualizations""" + + print_architecture_table() + print_mixin_explanation() + print_usage_examples() + print_mro_example() + print_decision_tree() + + print("=" * 100) + print("🎉 New architecture supports 9 combinations (3 model types × 3 compression algorithms)") + print("=" * 100) + + +if __name__ == "__main__": + main() diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index a968c55e6..dae46c8ad 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -11,156 +11,68 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import os -import sys -import time -import traceback -from dataclasses import asdict, dataclass, fields -from enum import Enum -from functools import partial -from typing import Any, Callable, Optional, Union - -import accelerate +from dataclasses import asdict, dataclass +from typing import Any, Optional, Union + import torch -from accelerate.big_modeling import dispatch_model, infer_auto_device_map -from accelerate.utils import get_balanced_memory, get_max_memory -from tqdm import tqdm -from transformers import AutoConfig, set_seed +from transformers import set_seed -from auto_round import envs from auto_round.algorithms.alg_config import AlgConfig -from auto_round.algorithms.base import BaseAlgorithm -from auto_round.algorithms.quantization import ARQuantizer, AutoRoundConfig, BaseQuantizers, QuantizationConfig -from auto_round.calibration.utils import ( - _infer_last_cache_name, - _update_inputs, -) -from auto_round.compressors.shard_writer import shard_writer -from auto_round.compressors_new.utils import ( - IndexSampler, - _get_quantized_layer_names_outside_blocks, - block_forward, - check_need_act_calibration, - check_skippable_keywords, - collect_best_params, - get_shared_keys, - infer_bits_by_data_type, - init_cache, - reset_params, - set_layer_config, -) +from auto_round.algorithms.quantization import BaseQuantizers, QuantizationConfig +from auto_round.compressors_new.utils import block_forward from auto_round.context.compress import CompressContext from auto_round.context.model import ModelContext -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.formats import OutputFormat, get_formats from auto_round.logger import logger -from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ -from auto_round.schemes import QuantizationScheme -from auto_round.special_model_handler import get_predefined_ignore_layers, update_module from auto_round.utils import ( - INNER_SUPPORTED_LAYER_TYPES, - SUPPORTED_DTYPES, SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, - CpuInfo, - check_and_mark_quantized_module, - check_seqlen_compatible, - check_to_quantized, - clear_memory, compile_func, - convert_dtype_str2torch, - convert_module_to_hp_if_necessary, - detect_device, - find_matching_blocks, - flatten_list, - get_block_names, - get_layer_names_in_block, - get_lm_head_name, - get_module, - global_state, - htcore, - is_auto_device_mapping, is_debug_mode, is_hpex_available, - is_moe_model, - is_moe_model_via_config, - is_quantized_input_module, - llm_load_model, - memory_monitor, - mv_module_from_gpu, - safe_device_move_with_meta_handling, - set_module, - to_device, - to_dtype, - unsupported_meta_device, -) -from auto_round.utils.device import ( - clear_memory_if_reached_threshold, - get_major_device, - parse_available_devices, - set_auto_device_map_for_block_with_tuning, - set_non_auto_device_map, ) +from auto_round.utils.device import set_non_auto_device_map from auto_round.utils.offload import OffloadManager -from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block - -SERIALIZATION_KEYS = ( - "bits", - "act_bits", - "data_type", - "act_data_type", - "group_size", - "act_group_size", - "sym", - "act_sym", - "act_dynamic", - "amp", - "batch_size", - "enable_minmax_tuning", - "enable_norm_bias_tuning", - "enable_quanted_input", - "gradient_accumulate_steps", - "iters", - "lr", - "low_gpu_mem_usage", - "minmax_lr", - "nsamples", - "quant_block_list", - "regex_config", - 
"scale_dtype", - "seqlen", - "supported_types", - "static_attention_dtype", - "static_kv_dtype", - "super_bits", - "super_group_size", - "to_quant_block_names", -) - - -class Compressor(object): - SKIP_ARGS = ("local_args", "kwargs", "cls", "config") - - def __new__( - cls, - config: Union[AlgConfig, list[AlgConfig]], - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - format=None, - **kwargs, - ): - # using different compressor base on AlgConfigs - local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} - - if isinstance(config, AutoRoundConfig): - return BaseCompressor(config, **local_args, **kwargs) +from auto_round.wrapper import wrapper_block + + +@dataclass +class SerializedCompressorConfig: + bits: Optional[int] = None + act_bits: Optional[int] = None + data_type: Optional[str] = None + act_data_type: Optional[str] = None + group_size: Optional[int] = None + act_group_size: Optional[int] = None + sym: Optional[bool] = None + act_sym: Optional[bool] = None + act_dynamic: Optional[bool] = None + amp: Optional[bool] = None + batch_size: Optional[int] = None + enable_minmax_tuning: Optional[bool] = True + enable_norm_bias_tuning: Optional[bool] = False + enable_quanted_input: Optional[bool] = True + gradient_accumulate_steps: Optional[int] = None + iters: Optional[int] = None + lr: Optional[float] = None + low_gpu_mem_usage: Optional[bool] = None + minmax_lr: Optional[float] = None + nsamples: Optional[int] = None + quant_block_list: Optional[list[str]] = None + regex_config: Optional[dict[str, Any]] = None + scale_dtype: Optional[str] = None + seqlen: Optional[int] = None + supported_types: Optional[list[str]] = SUPPORTED_LAYER_TYPES + static_attention_dtype: Optional[str] = None + static_kv_dtype: Optional[str] = None + super_bits: Optional[int] = None + super_group_size: Optional[int] = None + to_quant_block_names: Optional[list[str]] = None class BaseCompressor(object): need_calib: bool = True - supported_types = SUPPORTED_LAYER_TYPES def __init__( self, @@ -169,13 +81,9 @@ def __init__( tokenizer=None, platform="hf", format=None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, low_gpu_mem_usage: bool = False, device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, - enable_alg_ext: bool = False, - disable_opt_rtn: bool | None = None, seed: int = 42, low_cpu_mem_usage: bool = True, **kwargs, @@ -199,7 +107,6 @@ def __init__( nblocks = kwargs.pop("nblocks", 1) disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True) enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) - enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) self._offloader = OffloadManager(enabled=low_cpu_mem_usage, mode="offload", offload_dir_prefix="compressor") @@ -241,23 +148,8 @@ def __init__( set_seed(self.seed) self.nblocks = nblocks - self.dataset = dataset - self.iters = iters - - if self.iters == 0: - self.lr = 5e-3 - - # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it. 
- # To avoid None issue, we keep a copy though it's a little ugly - if enable_opt_rtn and disable_opt_rtn: - raise ValueError("`enable_opt_rtn` and `disable_opt_rtn` are mutually exclusive; " "only one can be set.") - if enable_opt_rtn: - disable_opt_rtn = False - self.orig_disable_opt_rtn = disable_opt_rtn - self.disable_opt_rtn = disable_opt_rtn - self.enable_torch_compile = enable_torch_compile - self.enable_alg_ext = enable_alg_ext + self.enable_torch_compile = enable_torch_compile # Whether to pack the layer immediately after tuning self.is_immediate_packing = False @@ -275,6 +167,9 @@ def __init__( low_gpu_mem_usage, device_map, enable_torch_compile, + is_immediate_packing=self.is_immediate_packing, + is_immediate_saving=self.is_immediate_saving, + formats=self.formats, ) self.model_context = ModelContext( model, @@ -287,21 +182,31 @@ def __init__( device=self.compress_context.device, ) - # backward compatible with the legacy API - def __getattr__(self, name: str) -> Any: - if name in self.__dict__: - return self.__dict__[name] - - for obj in ["quantize_config", "model_context", "compress_context", "quantizer"]: - if obj not in self.__dict__: - continue - obj = object.__getattribute__(self, obj) - try: - return object.__getattribute__(obj, name) - except AttributeError: - continue - - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: + """Sets the torch compile configuration for the tuning.""" + self.enable_torch_compile = enable_torch_compile + if ( + not self.enable_torch_compile + and TORCH_VERSION_AT_LEAST_2_6 + and self.quantize_config.act_bits > 8 + and not is_debug_mode() + and "fp8" not in self.quantize_config.data_type + and "fp8" not in self.quantize_config.act_data_type + and self.need_calib + ): + logger.info( + "%s", + "'enable_torch_compile' is set to `False` by default. " + "Enabling it can reduce tuning cost by 20%, but it might throw an exception.", + ) + # On HPU, we rely on torch.compile to speed up the model execution. 
+ if self.enable_torch_compile and self.quantize_config.is_wfp8afp8 and not is_hpex_available(): + self.enable_torch_compile = False + logger.warning("reset enable_torch_compile to `False` as fp8 is enabled") + # TODO: fix https://github.com/intel/auto-round/issues/1109 + if self.enable_torch_compile and self.quantize_config.is_act_nv_fp: + self.enable_torch_compile = False + logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") def post_init(self): self.model_context._load_model() @@ -316,30 +221,13 @@ def post_init(self): # self.other_alg.post_init() if self.other_alg is not None else None # check and update the format based on the current configuration - if self.formats: + if isinstance(self.formats, str): self.formats = get_formats(self.formats, self) + self.compress_context.formats = self.formats # Set device, must place after model loading set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) - # if self.iters != 0 and self.orig_disable_opt_rtn is not None: - # logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") - # self.disable_opt_rtn = True - # if ( - # self.quantization_args.bits >= 8 - # and self.quantization_args.act_bits >= 8 - # and self.iters == 0 - # and self.quantization_args.data_type == "int" - # and self.disable_opt_rtn is None - # ): - # logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") - # self.disable_opt_rtn = True - # if self.disable_opt_rtn is None and self.iters == 0: - # logger.info( - # "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." - # ) - # self.disable_opt_rtn = False - # after setting iters self._adjust_torch_compile(self.enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile @@ -364,38 +252,30 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool: # Determine if immediate packing is required self._adjust_immediate_packing_and_saving() - def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: - """Sets the torch compile configuration for the tuning.""" - self.enable_torch_compile = enable_torch_compile - if ( - not self.enable_torch_compile - and TORCH_VERSION_AT_LEAST_2_6 - and self.quantize_config.act_bits > 8 - and not is_debug_mode() - and "fp8" not in self.quantize_config.data_type - and "fp8" not in self.quantize_config.act_data_type - and self.iters > 0 - ): - logger.info( - "%s", - "'enable_torch_compile' is set to `False` by default. " - "Enabling it can reduce tuning cost by 20%, but it might throw an exception.", - ) - # On HPU, we rely on torch.compile to speed up the model execution. 
- if self.enable_torch_compile and self.quantize_config.is_wfp8afp8 and not is_hpex_available(): - self.enable_torch_compile = False - logger.warning("reset enable_torch_compile to `False` as fp8 is enabled") - # TODO: fix https://github.com/intel/auto-round/issues/1109 - if self.enable_torch_compile and self.quantize_config.is_act_nv_fp: - self.enable_torch_compile = False - logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") + # backward compatible with the legacy API + def __getattr__(self, name: str) -> Any: + if name in self.__dict__: + return self.__dict__[name] + + for obj in ["quantize_config", "model_context", "compress_context", "quantizer"]: + if obj not in self.__dict__: + continue + obj = object.__getattribute__(self, obj) + try: + return object.__getattribute__(obj, name) + except AttributeError: + continue + + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def _adjust_immediate_packing_and_saving(self): + from auto_round.algorithms.quantization.rtn.config import RTNConfig + formats = getattr(self, "formats", []) if len(formats) == 1 and not formats[0].is_fake() and self.inplace: self.is_immediate_packing = True - if self.quantizer.has_qlayer_outside_block and self.iters != 0: + if self.quantizer.has_qlayer_outside_block and self.need_calib: self.is_immediate_packing = False if not ("causallm" in self.model_context.model.__class__.__name__.lower() and not self.model_context.is_mllm): @@ -433,12 +313,16 @@ def _adjust_immediate_packing_and_saving(self): ) self.compress_context.low_cpu_mem_usage = False self.is_immediate_saving = False - elif self.quantizer.has_qlayer_outside_block and self.disable_opt_rtn and self.iters == 0: + elif ( + self.has_qlayer_outside_block + and getattr(self, "disable_opt_rtn", None) + and isinstance(self.quantize_config, RTNConfig) + ): logger.info( "Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): " "RTN path uses blockwise quantization and supports per-block offloading." ) - elif self.quantizer.has_qlayer_outside_block and self.iters > 0: + elif self.quantizer.has_qlayer_outside_block and not isinstance(self.quantize_config, RTNConfig): logger.warning( "`low_cpu_mem_usage` is not fully supported " "when there are quantized layers outside blocks and optimized RTN is disabled. " @@ -447,994 +331,22 @@ def _adjust_immediate_packing_and_saving(self): self.compress_context.low_cpu_mem_usage = False self.is_immediate_saving = False - if self.is_immediate_saving and "int" not in self.data_type: + if self.is_immediate_saving and "int" not in self.quantize_config.data_type: logger.warning("immediate_saving is only supported for int quantization, set to False") self.is_immediate_saving = False - if self.orig_output_dir is None: + if self.output_dir is None: self.is_immediate_saving = False - @torch.no_grad() - def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, last_cache_name=None): - """Attempts to cache intermediate data on GPU, if failed, then using CPU. - - Args: - block_names (list): List of block names to cache data for. - nsamples (int): Number of samples to use for caching. - layer_names (list, optional): List of layer names to cache data for. Defaults to []. - last_cache_name (str, optional): Name of the last cache. Defaults to None. - - Returns: - all_inputs: Cached intermediate data. - - Raises: - Exception: If caching on GPU fails, switches to CPU and caches there. 
- """ - if is_quantized_input_module(self.model_context.model): - layer_names = [] - if layer_names is None: - layer_names = [] - if self.compress_context.low_gpu_mem_usage or ( - len(block_names) == 1 - and len(layer_names) == 0 - and not self.quantizer.has_qlayer_outside_block - and (last_cache_name is None or last_cache_name in block_names) - ): - # low_gpu_mem_usage or calibrate only the embedding layer, which is also very fast on CPU - all_inputs = self.cache_inter_data(block_names, nsamples, layer_names=[], last_cache_name=last_cache_name) - else: - try: - if any(p.device.type == "meta" for p in self.model_context.model.parameters()): - materialize_model_(self.model_context.model) - - if ( - hasattr(self.model_context.model, "hf_device_map") - and len(self.model_context.model.hf_device_map) > 1 - ): - self.model_context.model = dispatch_model( - self.model_context.model, device_map=self.model_context.model.hf_device_map - ) - else: - # Change this if new device is supported - if str(self.model_context.model.device) == "cpu" and ( - not self.compress_context.device.startswith("hpu") - ): - # type(self.model_context.model._no_split_modules) changes from list to set when transformers > 5.0 - no_split_modules = list(getattr(self.model_context.model, "_no_split_modules", [])) - devices = parse_available_devices(self.compress_context.device_map) - - max_memory = get_max_memory() - new_max_memory = {} - if "cpu" not in devices: - devices.append("cpu") - for device in devices: - if ":" in device: - device = int(device.split(":")[-1]) - elif device == "cpu": - device = "cpu" - elif isinstance(device, str): - device = 0 - else: - raise ValueError( - f"Unsupported device {device} in device_map: {self.compress_context.device_map}" - ) - if device not in max_memory: - # Skip devices that aee not reported by accelerate's max_memory. - # This is expected when a device is unavailable or cannot provide memory info. - continue - # Use 90% of the reported max memory to leave headroom for activations, - # temporary tensors, other processes, and allocator fragmentation, reducing - # the chance of runtime OOM while still utilizing most available memory. - new_max_memory[device] = max_memory[device] * 0.9 - - # If non-CPU devices were requested but none survived, fall back to CPU caching - # via the OOM handler below, avoiding unnecessary dispatch overhead. - requested_non_cpu = any((d != "cpu") for d in devices) - has_non_cpu_memory = any((k != "cpu") for k in new_max_memory) - if requested_non_cpu and not has_non_cpu_memory: - raise torch.OutOfMemoryError( - "No non-CPU device available in accelerate's reported memory. " - "Falling back to CPU caching." - ) - - new_max_memory = get_balanced_memory( - self.model_context.model, - max_memory=new_max_memory, - no_split_module_classes=no_split_modules, - ) - self.model_context.model.tie_weights() - device_map = infer_auto_device_map( - self.model_context.model, - max_memory=new_max_memory, - no_split_module_classes=no_split_modules, - ) - if len(devices) > 1 and "cpu" in device_map.values(): - logger.warning( - "Some layers are offloaded to cpu, which may severely impact calibration speed." - " Please consider using more cards." - ) - - try: - - self.model_context.model = dispatch_model(self.model_context.model, device_map=device_map) - except ValueError as e: - if "offload_dir" in e.__str__(): - logger.warning( - f"Due to insufficient resources, disk is used to store the model." 
- f" `offload_dir={envs.AR_WORK_SPACE}`" - ) - self.model_context.model = dispatch_model( - self.model_context.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE - ) - else: - raise - else: - - self.model_context.model = self.model_context.model.to(self.compress_context.device) - - all_inputs = self.cache_inter_data( - block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name - ) - if ( - hasattr(self.model_context.model, "hf_device_map") - and len(self.model_context.model.hf_device_map) > 1 - ): - accelerate.hooks.remove_hook_from_submodules(self.model_context.model) - - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.info("switch to cpu to cache block inputs") - self.compress_context.cache_device = torch.device("cpu") - if self.quantizer.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM": - logger.warning( - "we recommend using more GPUs in calibration." - " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy." - ) - accelerate.hooks.remove_hook_from_submodules(self.model_context.model) - self.model_context.model = mv_module_from_gpu(self.model_context.model) - clear_memory(device_list=self.compress_context.device_list) - # Important change after v0.51, on cpu, we use rtn mode for layers in layer_names - all_inputs = self.cache_inter_data( - block_names, nsamples, layer_names=[], last_cache_name=last_cache_name - ) - except Exception as e: - logger.error(cuda_error_msg) - raise - return all_inputs - - @torch.no_grad() - def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_name=None): - """Save the inputs of block_name for calibration. - - This method temporarily replaces the forward method of the model to capture - the inputs passing through the specified block. It then calibrates the model - using a specified number of samples. Finally, it restores the original forward - method and returns the inputs for the specified block. - Args: - block_names (list): The names of the blocks for which inputs are to be saved. - layer_names (list):The names of the layers for which inputs are to be saved. - nsamples (int): The number of samples to use for calibration. - last_cache_name (str, optional): The name of the last layer to be cached, - we could break the forward in this layer to save time - - Returns: - dict: A dictionary containing the inputs for the specified block. 
- """ - if layer_names is None: - layer_names = [] - self.inputs = {} - self.to_cached_layers = block_names + layer_names - - tmp_dtype = None # TODO delete this as most model is not fp32 now - ## have bug if block name is not the first block - if (len(block_names) > 1 or len(layer_names) > 0) and self.compress_context.low_gpu_mem_usage: - tmp_dtype = self.model_context.model.dtype - if self.model_context.amp: - if self.model_context.model.dtype != self.model_context.model.dtype: - self.model_context.model = self.model_context.model.to(torch.bfloat16) - else: - self.model_context.model = self.model_context.model.to(torch.float32) ##model on cpu - - self.last_cache_name = _infer_last_cache_name(block_names, layer_names, last_cache_name) - self._cache_target_set = set(self.to_cached_layers) - self._cache_seen_targets = set() - calib_bs = self.quantizer.batch_size - self.hook_handles = [] - self._replace_forward() - self.calib(nsamples, calib_bs) - self.model_context.recover_forward() - res = self.inputs - del self.last_cache_name - del self._cache_target_set - del self._cache_seen_targets - del self.to_cached_layers - if tmp_dtype is not None: - self.model_context.model = self.model_context.model.to(tmp_dtype) - - return res - - @torch.no_grad() - def calib(self, nsamples, bs): - """Perform calibration for quantization. - - This method calibrates the model for quantization by processing a specified - number of samples from the calibration dataset. It ensures that the data is - properly formatted and feeds it to the model. If the number of samples processed - is less than the specified number, it logs a warning. If no samples are processed, - it logs an error and exits. - Args: - nsamples (int): The number of samples to use for calibration. - bs (int): The number of samples to use for calibration - """ - from auto_round.calib_dataset import get_dataloader - - need_attention_mask = True - if isinstance(self.dataset, str): - need_attention_mask = False # all supported datasets does not use pad - dataset = self.dataset.replace(" ", "") ##remove all whitespaces - - # slow here - self.dataloader = get_dataloader( - self.tokenizer, - self.seqlen, - dataset, - self.seed, - bs, - self.nsamples, - ) - else: - self.dataloader = self.dataset - total_cnt = 0 - if self.dataloader.__class__.__name__ == "BatchEncoding": - self.dataloader = [self.dataloader.data] - - for data in self.dataloader: - if data.__class__.__name__ == "BatchEncoding": - data = data.data - if data is None: - continue - if isinstance(data, torch.Tensor): - input_ids = data.to(self.model.device) - data_new = input_ids - elif isinstance(data, str): - if self.tokenizer is None: - logger.error("please provide tokenizer for string input") - exit(-1) - data = self.tokenizer(data, truncation=True, max_length=self.seqlen, return_tensors="pt").data - data_new = {} - for key in data.keys(): - data_new[key] = data[key].to(self.model.device) - input_ids = data_new["input_ids"] - elif isinstance(data, tuple) or isinstance(data, list): - data_new = to_device(data, self.model.device) - input_ids = data_new[0] - else: - data_new = {} - for key in data.keys(): - data_new[key] = to_device(data[key], self.model.device) - if key == "images": - data_new[key] = to_dtype(data_new[key], self.model.dtype) - input_ids = data_new["input_ids"] - if input_ids.shape[-1] < self.seqlen: - continue - if need_attention_mask: - if ( - isinstance(data_new, dict) - and "attention_mask" in data_new - and data_new["attention_mask"] is not None - ): - new_attention_mask = 
data_new["attention_mask"] - elif ( - self.tokenizer is not None - and hasattr(self.tokenizer, "pad_token") - and self.tokenizer.pad_token is not None - ): - new_attention_mask = (input_ids != self.tokenizer.pad_token_id).to(torch.long) - else: - # Default all ones - new_attention_mask = torch.ones_like(input_ids, dtype=torch.long) - - # For each sample, check if there are trailing repeated tokens - # If so, set the mask of the last token to 0 - batch_size, seq_len = input_ids.shape - for i in range(batch_size): - last_token = input_ids[i, -1] - # Check for trailing repeats - j = seq_len - 2 - repeated = False - while j >= 0 and input_ids[i, j] == last_token: - repeated = True - new_attention_mask[i, j] = 0 - j -= 1 - # If there was at least one repeat, set last token mask to 0 - if repeated: - new_attention_mask[i, -1] = 0 - - # Workaround: some models treat an all-1 attention mask as equivalent to None and - # will internally replace it with None for block inputs, which can cause tensor - # concatenation / shape-mismatch issues in downstream code. To avoid providing an - # all-1 mask, we force the last token in each sequence to be masked out (set to 0) - # so that the mask is never "all ones". This means the model will not attend to the - # last position, so the impact on accuracy is minimal as basically equivalent to dropping a single token - new_attention_mask[:, -1] = 0 - - self.quantizer.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) - else: - new_attention_mask = None - try: - kwargs = {"use_cache": False} - if new_attention_mask is not None and not (isinstance(data_new, dict) and "attention_mask" in data_new): - kwargs["attention_mask"] = new_attention_mask - - if isinstance(data_new, torch.Tensor): - self.model(data_new, **kwargs) - elif isinstance(data_new, tuple) or isinstance(data_new, list): - self.model(*data_new, **kwargs) - else: - self.model(**data_new, **kwargs) - except NotImplementedError: - pass - except RuntimeError as error: - error_msg = str(error) - if "The expanded size of the tensor" in str(error_msg) and "must match the existing size" in error_msg: - check_seqlen_compatible(self.seqlen, self.tokenizer, self.model) - logger.warning( - "When quantization encounters tensor shape mismatch error, " - "you can try to avoid it with batch_size=1" - ) - raise error - except Exception as error: - raise error - - total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1 - if total_cnt >= nsamples: - break - if total_cnt == 0: - logger.error( - f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " - f"dataset or decease the sequence length" - ) - exit(-1) - elif total_cnt < nsamples: - logger.warning_once( - f"An insufficient number of samples likely reduces the accuracy of the quantized model. " - f"Target samples count is {nsamples}, while valid samples count is {total_cnt}" - ) - - @torch.no_grad() - def _get_block_forward_func(self, name: str) -> Callable: - """Gets the forward function. - - Args: - name (str): The name of the function. - Returns: - function: The forward function. - """ - - def post_process_cache_data(batch_size, data, data_name): - """ - Processes store data for batch handling, reshaping if necessary. - - Args: - batch_size (int): The size of the batch. - data: The data value to store, potentially for caching. - data_name (str): Name of the data. 
- - Returns: - Processed data or None - """ - new_data = data - if batch_size <= 1: - return new_data - if data_name in self.model_context.shared_cache_keys: - return None - if "alibi" in data_name: - if isinstance(data, torch.Tensor): - alibi = data - alibi = alibi.reshape(batch_size, -1, alibi.shape[1], alibi.shape[2]) - new_data = alibi - return new_data - - def forward(m, hidden_states=None, *positional_inputs, **kwargs): - """Rewrite forward function, process and collect input data. - - Args: - hidden_states (torch.Tensor): The hidden states tensor. - *positional_inputs: Variable number of positional arguments. - **kwargs: Variable number of keyword arguments. - - Returns: - NotImplementedError: Getting the first layer inputs and then raise the error to save runtime. - """ - if name not in self.inputs: - self.inputs[name] = {} - init_cache(positional_inputs, self.inputs[name]) - - if self.quantizer.batch_dim is None: - self.quantizer.batch_dim = 0 - if hidden_states is not None and self.quantizer.batch_size > 1: - if hidden_states.shape[0] > self.quantizer.batch_size: - self.quantizer.batch_dim = 1 - if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.quantizer.batch_size: - logger.error( - "this model has not been supported, " - "please raise an issue in https://github.com/intel/auto-round/issues" - " or try to set the `batch_size` to 1 and " - "`gradient_accumulate_steps` to your current batch size." - ) - exit(-1) - - if hidden_states is not None: - kwargs["hidden_states"] = hidden_states - - for key in kwargs.keys(): - if ( - isinstance(kwargs[key], torch.Tensor) - or isinstance(kwargs[key], list) - or isinstance(kwargs[key], tuple) - ): - if key not in self.inputs[name].keys(): # initialization - data = to_device(kwargs[key], device=torch.device("cpu")) - if data is None or ( - self.quantizer.batch_size > 1 and key in self.model_context.shared_cache_keys - ): - self.inputs[name][key] = data - continue - if self.quantizer.batch_size <= 1: - self.inputs[name][key] = [data] - else: - data = post_process_cache_data(self.quantizer.batch_size, data, key) - self.inputs[name][key] = list(torch.split(data, 1, dim=self.quantizer.batch_dim)) - else: # append cache inputs - new_data = post_process_cache_data(self.quantizer.batch_size, kwargs[key], key) - if new_data is None: # shareable args or NoneType - continue - new_data = to_device(new_data, device=torch.device("cpu")) - if self.quantizer.batch_size <= 1: - self.inputs[name][key].append(new_data) - else: - self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.quantizer.batch_dim))) - elif isinstance(kwargs[key], (str, bool, type(None))): - if key not in self.inputs[name].keys(): - self.inputs[name][key] = kwargs[key] - else: - # Parameters not to be cached - if check_skippable_keywords(key): - logger.warning_once( - f"Please note that '{key}' key" " is not currently used in quantization fine-tuning." 
- ) - reset_params(self.inputs[name]) - - if self._should_stop_cache_forward(name): - raise NotImplementedError - else: - if hidden_states is not None: - kwargs.pop("hidden_states") - return m.orig_forward(hidden_states, *positional_inputs, **kwargs) - else: - # Currently only for Llama-3.2-Vision-Instruct Series - return m.orig_forward(*positional_inputs, **kwargs) - - return forward - - @torch.no_grad() - def _get_cache_data_hook_for_layer(self, name): - """A forward hook to save input max of a module - :param name: the module name - :return: A hook function.""" - - def cache_input_hook(module, inputs, outputs): - input = inputs - if isinstance(inputs, tuple) or isinstance(input, list): - input = inputs[0] - if name in self.inputs: - self.inputs[name].extend(list(torch.split(input.to("cpu"), 1, dim=0))) - else: - self.inputs[name] = list(torch.split(input.to("cpu"), 1, dim=0)) - - if self._should_stop_cache_forward(name): - raise NotImplementedError - - return cache_input_hook - - def _replace_forward(self): - """Replaces the forward function.""" - - def register_hook(n, m, hook_handles): - if n in self.to_cached_layers and type(m) not in SUPPORTED_LAYER_TYPES: ##block - m.orig_forward = m.forward - m.forward = partial(self._get_block_forward_func(n), m) - elif n in self.to_cached_layers: ##linear layer or conv1d layer - hook_func = self._get_cache_data_hook_for_layer(n) - hook_handle = m.register_forward_hook(hook_func) - hook_handles.append(hook_handle) - - self.model_context.replace_forward(register_hook) - - def _should_stop_cache_forward(self, name: str) -> bool: - """Determine whether current forward pass can stop after caching `name`.""" - if name == self.last_cache_name: - return True - - if self.last_cache_name is not None: - return False - - if not hasattr(self, "_cache_target_set") or not hasattr(self, "_cache_seen_targets"): - return False - - if name in self._cache_target_set: - self._cache_seen_targets.add(name) - - if not self._cache_target_set.issubset(self._cache_seen_targets): - return False - - # Lock the last cache name after the first full forward pass. - self.last_cache_name = name - return True - - def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): - input_ids, input_others = self._split_inputs(inputs, first_input_name) - clear_memory(device_list=self.compress_context.device_list) - input_ids = to_device(input_ids, self.compress_context.cache_device) - input_others = to_device(input_others, self.compress_context.cache_device) - # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - - tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 - input_ids = to_dtype(input_ids, tmp_dtype) - - for key in input_others.keys(): - if isinstance(input_others[key], torch.Tensor) and ( - input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 - ): - input_others[key] = input_others[key].to(tmp_dtype) - elif isinstance(input_others[key], list): - for i in range(len(input_others[key])): - to_dtype(input_others[key][i], tmp_dtype) - return input_ids, input_others - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: - input_ids = inputs[first_input_name] - inputs.pop(first_input_name, None) - input_others = inputs - return input_ids, input_others - - @torch.inference_mode() - def _quantize_embedding_layer(self): - """Quantizes embedding layers in the model according to the configuration. 
- - This method iterates through all modules in the model, identifies embedding - layers specified in `self.quantizer.layer_config`, and applies the appropriate quantization - function based on bit precision, grouping strategy, and dtype. - - Returns: - bool: True if the quantization process completes without critical errors. - """ - is_quantized = False - for name, module in self.model.named_modules(): - # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.quantizer.layer_config: - continue - - config = self.quantizer.layer_config[name] - - # Skip layers that are not marked for quantization - if not check_to_quantized(config): - continue - is_quantized = True - config["scale_dtype"] = self.quantizer.scale_dtype - dtype = config["data_type"] - - # Determine quantization function key with symmetry/asymmetry - if dtype not in QUANT_FUNC_WITH_DTYPE: - dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" - - # Optionally use optimized rounding (RTN) variant - if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: - dtype = f"rtn_{dtype}" - - quant_func = QUANT_FUNC_WITH_DTYPE[dtype] - dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, - # to avoid cache a bf16 copy we'd better use float32 - if config.get("super_group_size", None) is not None: - dtype = torch.float32 - - # Attempt quantization on GPU, fall back to CPU if OOM - try: - weight, scale, zp = quant_func( - module.weight.to(dtype=dtype, device=self.compress_context.device), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU") - weight, scale, zp = quant_func( - module.weight.to("cpu"), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except Exception as e: - raise - - # Overwrite the module's weights with the quantized version - module.weight.data.copy_(weight.cpu()) - - # Attach scale and zero point (zp) to the module - for param_name, value in zip(["scale", "zp"], [scale, zp]): - if isinstance(value, dict): - for k, v in value.items(): - setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) - elif isinstance(value, torch.Tensor): - setattr(module, param_name, value.cpu()) - else: - setattr(module, param_name, value) - - # Update config - self.quantizer.layer_config.setdefault(name, {}).update(config) - del weight - del scale - del zp - clear_memory(device_list=self.compress_context.device_list) - - return is_quantized - - def _quantize_blocks( - self, - model: torch.nn.Module, - inputs: dict, - block_names: list, - q_input: torch.Tensor = None, - nblocks: int = 1, - device: str = "cpu", - pbar: tqdm = None, - ): - """Quantize and dequantize the weights of the specified blocks in the model. - - Args: - model: The PyTorch model to be quantized. - inputs: The input data for quantization. - block_names: The names of the blocks to be quantized and dequantized. - nblocks: The number of blocks to quantize and dequantize. - device: The device for quantization and dequantization. 
- - Returns: - None - """ - clear_memory(device_list=self.compress_context.device_list) - for n, m in model.named_parameters(): - m.requires_grad_(False) - - input_ids, input_others = self._preprocess_block_inputs(inputs) - - if pbar is None: - pbar = tqdm(range(0, len(block_names), nblocks)) - - for i in range(0, len(block_names), nblocks): - if i != 0: - pbar.update(1) - if nblocks == 1: - n = block_names[i] - pbar.set_description(f"Quantizing {n}") - m = get_module(model, n) - else: - names = block_names[i : min(i + nblocks, len(block_names))] - pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") - modules = [get_module(model, n) for n in names] - m = WrapperMultiblock(modules) - - if self.compress_context.low_cpu_mem_usage: - if nblocks == 1: - self._offloader.reload(model, n) - else: - self._offloader.reload(model, names) - - m.config = model.config if hasattr(model, "config") else None - q_input, input_ids = self.quantizer.quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) - if hasattr(model, "config"): - del m.config - if self.is_immediate_packing: - for n, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self._immediate_pack(tmp_m.global_name) - - if self.is_immediate_saving: - shard_writer(self, m, is_finalize=False) - - if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: - if nblocks == 1: - self._offloader.offload(model, n, overwrite=True) - else: - for name in names: - self._offloader.offload(model, name, overwrite=True) - if pbar is not None: - pbar.update(1) - - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - for n, m in self.model.named_modules(): - if hasattr(m, "name"): - delattr(m, "name") - - del q_input - del input_ids - del input_others - del inputs - - clear_memory(device_list=self.compress_context.device_list) + self.compress_context.is_immediate_packing = self.is_immediate_packing + self.compress_context.is_immediate_saving = self.is_immediate_saving def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. Returns: The quantized model and layer configurations. """ - - self.post_init() - self.model_context.initialize(formats=self.formats) - - self._check_compatibility() - - if bool(self.quantizer.quant_block_list): - all_blocks = self.quantizer.quant_block_list - else: - all_blocks = get_block_names(self.model_context.model) - - if len(all_blocks) == 0: - logger.warning("could not find blocks, exit with original model") - return self.model_context.model, self.quantizer.layer_config - - layer_names = _get_quantized_layer_names_outside_blocks( - model=self.model_context.model, - layer_config=self.quantizer.layer_config, - supported_types=SUPPORTED_LAYER_TYPES, - quant_block_list=self.quantizer.quant_block_list, - ) - start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: - logger.info( - "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names - ) - else: - logger.info("start to cache block inputs") - all_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, - self.nsamples, - layer_names, - ) - self.inputs = all_inputs - is_quantized_embedding = self._quantize_embedding_layer() - clear_memory(device_list=self.compress_context.device_list) - all_q_inputs = None - if is_quantized_embedding: - all_inputs = copy.deepcopy(self.inputs) - clear_memory(self.inputs, device_list=self.compress_context.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - self.inputs = all_q_inputs - # Remove accelerate dispatch hooks before moving parameters. - # hf_device_map is kept for reference but hooks are no longer needed. - if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model_context.model) - self.model_context.model = mv_module_from_gpu(self.model_context.model) - clear_memory(device_list=self.compress_context.device_list) - logger.info("caching done") - if self.compress_context.low_cpu_mem_usage: - self._offloader.offload( - self.model_context.model, all_blocks, clear_memory=True, device_list=self.compress_context.device_list - ) - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) - else: - pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar - - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) - q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) - - inputs, q_inputs = _update_inputs(inputs, q_inputs) - - clear_memory(self.inputs, device_list=self.compress_context.device_list) - - if "input_ids" in inputs.keys(): - total_samples = len(inputs["input_ids"]) - if total_samples < self.quantizer.batch_size: - self.quantizer.batch_size = total_samples - logger.warning(f"force the train batch size to {total_samples}") - - self._quantize_blocks( - self.model_context.model, - inputs, - block_names, - q_input=q_inputs if q_inputs is not None else None, - nblocks=self.nblocks, - device=self.compress_context.device, - pbar=pbar, - ) - if self.is_immediate_packing and len(self.formats) != 1: - raise ValueError( - f"Expected exactly one packing format when 'immediate_packing' is True, " - f"but got {len(self.formats)} formats." 
- ) - pbar.set_description("Quantizing done") - pbar.close() - self._quantize_layers(layer_names, all_inputs) - - convert_module_to_hp_if_necessary( - self.model_context.model, self.model_context.amp_dtype, self.compress_context.device, to_cpu=True - ) - if self.is_immediate_saving: - shard_writer(self, is_finalize=True) - - if self.compress_context.low_cpu_mem_usage: - self._offloader.reload(self.model_context.model) - - end_time = time.time() - cost_time = end_time - start_time - logger.info(f"quantization tuning time {cost_time}") - - # Dump a summary - quantized_layers = [] - unquantized_layers = [] - for n, m in self.model_context.model.named_modules(): - if isinstance(m, tuple(SUPPORTED_LAYER_TYPES)): - if check_to_quantized(m): - quantized_layers.append(n) - else: - unquantized_layers.append(n) - elif hasattr(m, "scales") or hasattr(m, "scale"): # packing_immediately - quantized_layers.append(n) - summary_info = ( - f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" - ) - if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" - logger.info(summary_info) - - self.model_context.quantized = True - return self.model_context.model, self.quantizer.layer_config - - def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: - """Quantizes specified layers based on inputs and configuration. - - Args: - layer_names (list): list of layer names to quantize. - layer_inputs (dict): Dictionary mapping layer names to input data. - - Returns: - None - """ - # TODO currently we take all the layers outside blocks as post block layers which is not optimal - # if there is no input for layer, we use rtn - - for layer_name in copy.deepcopy(layer_names): - if layer_name not in layer_inputs: - if self.act_bits < 16 and not self.act_dynamic: - # Activation quantization requires collected inputs - msg_prefix = ( - f"Activation max hook for layer '{layer_name}' is unavailable due to " - f"insufficient collected inputs. " - ) - if "fp8_e5m2" in self.act_data_type: - logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") - else: - logger.warning( - msg_prefix + "Static activation quantization is not supported or ineffective, " - "Skipping quantization for this layer." 
- ) - layer_names.remove(layer_name) - continue - logger.info(f"using rtn to quantize {layer_name}") - from auto_round.data_type import QUANT_FUNC_WITH_DTYPE - - layer = get_module(self.model, layer_name) - layer = layer.to(self.device) - layer = convert_module_to_hp_if_necessary(layer, self.model_context.amp_dtype, self.device) - set_module(self.model, layer_name, layer) - - wrapper_layer = WrapperLinear( - layer, - enable_round_tuning=False, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_torch_compile=self.enable_torch_compile, - device=self.device, - disable_opt_rtn=self.disable_opt_rtn, - ) - new_layer = wrapper_layer.unwrapper({}) - set_module(self.model, layer_name, new_layer) - layer.cpu() - layer_names.remove(layer_name) - if len(layer_names) == 0: - memory_monitor.update() - memory_monitor.log_summary() - return - q_layer_inputs = None - enable_quanted_input = self.enable_quanted_input - has_gguf = False - - if hasattr(self, "formats"): - has_gguf = any(format_.is_gguf() for format_ in self.formats) - if has_gguf and self.is_immediate_packing: - enable_quanted_input = False - - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: - dispatch_model(self.model, self.model.hf_device_map) - - if enable_quanted_input: - logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules( - self.model - ) # self.model.hf_device_map has not been changed - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - quant_layer = self.quantizer.quantize_layer - for layer_name in layer_names: - layer_input = layer_inputs[layer_name] - layer_input = to_device(layer_input, self.cache_device) - q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None - q_layer_input = to_device(q_layer_input, self.cache_device) - quant_layer(layer_name, layer_input, q_layer_input, device=self.device) - if self.is_immediate_packing: - self._immediate_pack(layer_name) - - if self.is_immediate_saving: - m = get_module(self.model, layer_name) - shard_writer(self, m, name=layer_name, is_finalize=False) - del layer_input - clear_memory(q_layer_input, device_list=self.device_list) - memory_monitor.log_summary() - - def _check_compatibility(self) -> None: - """Checks compatibility of the configurations and model.""" - if ( - self.seqlen is not None - and hasattr(self.model_context.model, "config") - and hasattr(self.model_context.model.config, "max_position_embeddings") - ): - if self.model_context.model.config.max_position_embeddings < self.seqlen: - logger.warning( - f"Change sequence length to {self.model_context.model.config.max_position_embeddings} " - "due to the limitation of max_position_embeddings" - ) - self.seqlen = min(self.seqlen, self.model_context.model.config.max_position_embeddings) - - if self.seqlen is not None and hasattr(self.tokenizer, "model_max_length"): - if self.tokenizer.model_max_length < self.seqlen: - logger.warning( - f"Change sequence length to {self.tokenizer.model_max_length} " - "due to the limitation of model_max_length. " - "You can also try to increase the model_max_length to avoid this issue." 
- ) - self.seqlen = min(self.seqlen, self.tokenizer.model_max_length) - - if self.group_size == 0 and "fp8" not in self.data_type: - logger.warning("`group_size==0` is not supported for data_type other than fp8 ") - - if self.bits <= 2 and (self.iters < 1000 or not self.enable_alg_ext) and self.super_group_size is None: - logger.warning( - "for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` " - ) + raise NotImplementedError("quantize method must be implemented in subclass") def _get_save_folder_name(self, format: OutputFormat) -> str: """Generates the save folder name based on the provided format string. @@ -1454,9 +366,9 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: # Use a subfolder only if there are multiple formats if len(self.formats) > 1: - return os.path.join(self.orig_output_dir, sanitized_format) + return os.path.join(self.output_dir, sanitized_format) - return self.orig_output_dir + return self.output_dir def save_quantized( self, @@ -1477,7 +389,7 @@ def save_quantized( Returns: object: The compressed model object. """ - self.orig_output_dir = output_dir + self.output_dir = output_dir if format is not None: logger.warning( f"save_quantized with format is deprecated and will be deleted in auto_round version 1.0." @@ -1488,7 +400,7 @@ def save_quantized( if not hasattr(self, "formats"): self.formats = formats - if not self.quantized: + if not self.model_context.quantized: logger.warning("please run autoround.quantize first") return folders = [] @@ -1500,14 +412,15 @@ def save_quantized( "Please ensure that your configuration is supported." ) - serialization_dict = {} - for key in SERIALIZATION_KEYS: - serialization_dict[key] = getattr(self, key) + serialization_dict = asdict(SerializedCompressorConfig()) + for key in serialization_dict: + serialization_dict[key] = getattr(self, key, serialization_dict[key]) from auto_round.version import __version__ serialization_dict["autoround_version"] = __version__ if "scale_dtype" in serialization_dict.keys(): serialization_dict["scale_dtype"] = str(serialization_dict["scale_dtype"]) + compressed_model = format.save_quantized( save_folder, model=self.model_context.model, @@ -1551,7 +464,8 @@ def quantize_and_save( ValueError: If an unsupported format is specified. """ # Validate and process the specified formats - self.orig_output_dir = output_dir + self.output_dir = output_dir + self.compress_context.output_dir = output_dir # check and update the format based on the current configuration if format and self.formats is not None: @@ -1559,14 +473,14 @@ def quantize_and_save( f"quantize_and_save with format is deprecated and will be deleted in auto_round version 1.0." f" Please use Compressor(format='{format}' instead)." 
) - format_list = get_formats(format, self) - self.formats = format_list + self.formats = format if self.formats is None: - logger.info("format is not set, using default auto_round format.") - self.formats = get_formats("auto_round", self) + if self.formats is None: + logger.info("format is not set, using default auto_round format.") + self.formats = "auto_round" # If multiple formats are specified, enforce inplace=False - if len(self.formats) > 1: + if len(self.formats.split(",")) > 1: inplace = False self.inplace = kwargs.get("inplace", inplace) kwargs.pop("inplace", None) @@ -1576,32 +490,17 @@ def quantize_and_save( from auto_round.experimental.attention import attention_quant_ctx with attention_quant_ctx(self.model_context.model, static_attention_dtype=self.static_attention_dtype): - model, _ = self.quantize() + self.quantize() elif self.static_kv_dtype is not None: from auto_round.experimental.kv_cache import kvcache_quant_context with kvcache_quant_context(self.model_context.model, static_kv_dtype=self.static_kv_dtype): - model, _ = self.quantize() + self.quantize() else: - model, _ = self.quantize() - # Save the quantized model in the specified format_list - model, folders = self.save_quantized(output_dir, inplace=inplace, return_folders=True, **kwargs) - memory_monitor.log_summary() - - return model, folders + self.quantize() - -class TuneCompressor(BaseCompressor): - need_calib: bool = True - - -class ZeroShotCompressor(BaseCompressor): - need_calib: bool = False - - def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: - """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. - Returns: - The quantized model and layer configurations. - """ - - pass + # When immediate_saving is enabled, the model has already been saved during quantization + # Skip the save_quantized call to avoid attempting to save layers that are on meta device + if self.is_immediate_saving: + logger.info("immediate_saving is enabled, model already saved during quantization") + return self.model, [output_dir] diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py new file mode 100644 index 000000000..2f0d8dbcc --- /dev/null +++ b/auto_round/compressors_new/calib.py @@ -0,0 +1,1235 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy
+import time
+import traceback
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import accelerate
+import torch
+from accelerate.big_modeling import dispatch_model, infer_auto_device_map
+from accelerate.utils import get_balanced_memory, get_max_memory
+from tqdm import tqdm
+
+from auto_round import envs
+from auto_round.algorithms.alg_config import AlgConfig
+from auto_round.calibration.utils import (
+    _infer_last_cache_name,
+    _update_inputs,
+)
+from auto_round.compressors.shard_writer import shard_writer
+from auto_round.compressors_new.base import BaseCompressor
+from auto_round.compressors_new.utils import (
+    _get_quantized_layer_names_outside_blocks,
+    check_skippable_keywords,
+    immediate_pack,
+    init_cache,
+    reset_params,
+)
+from auto_round.logger import logger
+from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_
+from auto_round.utils import (
+    SUPPORTED_LAYER_TYPES,
+    check_seqlen_compatible,
+    check_to_quantized,
+    clear_memory,
+    convert_module_to_hp_if_necessary,
+    get_block_names,
+    get_module,
+    is_quantized_input_module,
+    memory_monitor,
+    mv_module_from_gpu,
+    set_module,
+    to_device,
+    to_dtype,
+)
+from auto_round.utils.device import (
+    parse_available_devices,
+)
+from auto_round.wrapper import WrapperLinear, WrapperMultiblock
+
+
+class CalibCompessor(BaseCompressor):
+    need_calib: bool = True
+
+    def __init__(
+        self,
+        config: Union[AlgConfig, list[AlgConfig]],
+        model: Union[torch.nn.Module, str],
+        tokenizer=None,
+        platform="hf",
+        format=None,
+        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
+        iters: int = 200,
+        low_gpu_mem_usage: bool = False,
+        device_map: Union[str, torch.device, int, dict] = 0,
+        enable_torch_compile: bool = False,
+        seed: int = 42,
+        low_cpu_mem_usage: bool = True,
+        **kwargs,
+    ):
+        self.dataset = dataset
+        self.iters = iters
+        super().__init__(
+            config,
+            model,
+            tokenizer,
+            platform,
+            format,
+            low_gpu_mem_usage,
+            device_map,
+            enable_torch_compile,
+            seed,
+            low_cpu_mem_usage,
+            **kwargs,
+        )
+        if iters == 0:
+            self.lr = 5e-3
+
+    @torch.no_grad()
+    def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, last_cache_name=None):
+        """Attempts to cache intermediate data on GPU, if failed, then using CPU.
+
+        Args:
+            block_names (list): List of block names to cache data for.
+            nsamples (int): Number of samples to use for caching.
+            layer_names (list, optional): List of layer names to cache data for. Defaults to [].
+            last_cache_name (str, optional): Name of the last cache. Defaults to None.
+
+        Returns:
+            all_inputs: Cached intermediate data.
+
+        Raises:
+            Exception: If caching on GPU fails, switches to CPU and caches there.
+        """
+        if is_quantized_input_module(self.model_context.model):
+            layer_names = []
+        if layer_names is None:
+            layer_names = []
+        if self.compress_context.low_gpu_mem_usage or (
+            len(block_names) == 1
+            and len(layer_names) == 0
+            and not self.quantizer.has_qlayer_outside_block
+            and (last_cache_name is None or last_cache_name in block_names)
+        ):
+            # low_gpu_mem_usage or calibrate only the embedding layer, which is also very fast on CPU
+            all_inputs = self.cache_inter_data(block_names, nsamples, layer_names=[], last_cache_name=last_cache_name)
+        else:
+            try:
+                if any(p.device.type == "meta" for p in self.model_context.model.parameters()):
+                    materialize_model_(self.model_context.model)
+
+                if (
+                    hasattr(self.model_context.model, "hf_device_map")
+                    and len(self.model_context.model.hf_device_map) > 1
+                ):
+                    self.model_context.model = dispatch_model(
+                        self.model_context.model, device_map=self.model_context.model.hf_device_map
+                    )
+                else:
+                    # Change this if new device is supported
+                    if str(self.model_context.model.device) == "cpu" and (
+                        not self.compress_context.device.startswith("hpu")
+                    ):
+                        # type(self.model_context.model._no_split_modules) changes from list to set when transformers > 5.0
+                        no_split_modules = list(getattr(self.model_context.model, "_no_split_modules", []))
+                        devices = parse_available_devices(self.compress_context.device_map)
+
+                        max_memory = get_max_memory()
+                        new_max_memory = {}
+                        if "cpu" not in devices:
+                            devices.append("cpu")
+                        for device in devices:
+                            if ":" in device:
+                                device = int(device.split(":")[-1])
+                            elif device == "cpu":
+                                device = "cpu"
+                            elif isinstance(device, str):
+                                device = 0
+                            else:
+                                raise ValueError(
+                                    f"Unsupported device {device} in device_map: {self.compress_context.device_map}"
+                                )
+                            if device not in max_memory:
+                                # Skip devices that are not reported by accelerate's max_memory.
+                                # This is expected when a device is unavailable or cannot provide memory info.
+                                continue
+                            # Use 90% of the reported max memory to leave headroom for activations,
+                            # temporary tensors, other processes, and allocator fragmentation, reducing
+                            # the chance of runtime OOM while still utilizing most available memory.
+                            new_max_memory[device] = max_memory[device] * 0.9
+
+                        # If non-CPU devices were requested but none survived, fall back to CPU caching
+                        # via the OOM handler below, avoiding unnecessary dispatch overhead.
+                        requested_non_cpu = any((d != "cpu") for d in devices)
+                        has_non_cpu_memory = any((k != "cpu") for k in new_max_memory)
+                        if requested_non_cpu and not has_non_cpu_memory:
+                            raise torch.OutOfMemoryError(
+                                "No non-CPU device available in accelerate's reported memory. "
+                                "Falling back to CPU caching."
+                            )
+
+                        new_max_memory = get_balanced_memory(
+                            self.model_context.model,
+                            max_memory=new_max_memory,
+                            no_split_module_classes=no_split_modules,
+                        )
+                        self.model_context.model.tie_weights()
+                        device_map = infer_auto_device_map(
+                            self.model_context.model,
+                            max_memory=new_max_memory,
+                            no_split_module_classes=no_split_modules,
+                        )
+                        if len(devices) > 1 and "cpu" in device_map.values():
+                            logger.warning(
+                                "Some layers are offloaded to cpu, which may severely impact calibration speed."
+                                " Please consider using more cards."
+                            )
+
+                        try:
+
+                            self.model_context.model = dispatch_model(self.model_context.model, device_map=device_map)
+                        except ValueError as e:
+                            if "offload_dir" in e.__str__():
+                                logger.warning(
+                                    f"Due to insufficient resources, disk is used to store the model."
+                                    f" `offload_dir={envs.AR_WORK_SPACE}`"
+                                )
+                                self.model_context.model = dispatch_model(
+                                    self.model_context.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE
+                                )
+                            else:
+                                raise
+                    else:
+
+                        self.model_context.model = self.model_context.model.to(self.compress_context.device)
+
+                all_inputs = self.cache_inter_data(
+                    block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name
+                )
+                if (
+                    hasattr(self.model_context.model, "hf_device_map")
+                    and len(self.model_context.model.hf_device_map) > 1
+                ):
+                    accelerate.hooks.remove_hook_from_submodules(self.model_context.model)
+
+            except torch.OutOfMemoryError:
+                cuda_error_msg = traceback.format_exc()
+                try:
+                    logger.info("switch to cpu to cache block inputs")
+                    self.compress_context.cache_device = torch.device("cpu")
+                    if self.quantizer.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM":
+                        logger.warning(
+                            "we recommend using more GPUs in calibration."
+                            " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy."
+                        )
+                    accelerate.hooks.remove_hook_from_submodules(self.model_context.model)
+                    self.model_context.model = mv_module_from_gpu(self.model_context.model)
+                    clear_memory(device_list=self.compress_context.device_list)
+                    # Important change after v0.51, on cpu, we use rtn mode for layers in layer_names
+                    all_inputs = self.cache_inter_data(
+                        block_names, nsamples, layer_names=[], last_cache_name=last_cache_name
+                    )
+                except Exception as e:
+                    logger.error(cuda_error_msg)
+                    raise
+        return all_inputs
+
+    @torch.no_grad()
+    def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_name=None):
+        """Save the inputs of block_name for calibration.
+
+        This method temporarily replaces the forward method of the model to capture
+        the inputs passing through the specified block. It then calibrates the model
+        using a specified number of samples. Finally, it restores the original forward
+        method and returns the inputs for the specified block.
+        Args:
+            block_names (list): The names of the blocks for which inputs are to be saved.
+            layer_names (list): The names of the layers for which inputs are to be saved.
+            nsamples (int): The number of samples to use for calibration.
+            last_cache_name (str, optional): The name of the last layer to be cached,
+                we could break the forward in this layer to save time
+
+        Returns:
+            dict: A dictionary containing the inputs for the specified block.
+        """
+        if layer_names is None:
+            layer_names = []
+        self.inputs = {}
+        self.to_cached_layers = block_names + layer_names
+
+        tmp_dtype = None  # TODO delete this as most models are not fp32 now
+        ## have bug if block name is not the first block
+        if (len(block_names) > 1 or len(layer_names) > 0) and self.compress_context.low_gpu_mem_usage:
+            tmp_dtype = self.model_context.model.dtype
+            if self.model_context.amp:
+                if self.model_context.model.dtype != self.model_context.model.dtype:
+                    self.model_context.model = self.model_context.model.to(torch.bfloat16)
+            else:
+                self.model_context.model = self.model_context.model.to(torch.float32)  ##model on cpu
+
+        self.last_cache_name = _infer_last_cache_name(block_names, layer_names, last_cache_name)
+        self._cache_target_set = set(self.to_cached_layers)
+        self._cache_seen_targets = set()
+        calib_bs = self.quantizer.batch_size
+        self.hook_handles = []
+        self._replace_forward()
+        self.calib(nsamples, calib_bs)
+        self.model_context.recover_forward()
+        res = self.inputs
+        del self.last_cache_name
+        del self._cache_target_set
+        del self._cache_seen_targets
+        del self.to_cached_layers
+        if tmp_dtype is not None:
+            self.model_context.model = self.model_context.model.to(tmp_dtype)
+
+        return res
+
+    @torch.no_grad()
+    def calib(self, nsamples, bs):
+        """Perform calibration for quantization.
+
+        This method calibrates the model for quantization by processing a specified
+        number of samples from the calibration dataset. It ensures that the data is
+        properly formatted and feeds it to the model. If the number of samples processed
+        is less than the specified number, it logs a warning. If no samples are processed,
+        it logs an error and exits.
+        Args:
+            nsamples (int): The number of samples to use for calibration.
+            bs (int): The batch size to use for calibration.
+        """
+        from auto_round.calib_dataset import get_dataloader
+
+        need_attention_mask = True
+        if isinstance(self.dataset, str):
+            need_attention_mask = False  # all supported datasets do not use pad
+            dataset = self.dataset.replace(" ", "")  ##remove all whitespaces
+
+            # slow here
+            self.dataloader = get_dataloader(
+                self.tokenizer,
+                self.quantize_config.seqlen,
+                dataset,
+                self.seed,
+                bs,
+                self.quantize_config.nsamples,
+            )
+        else:
+            self.dataloader = self.dataset
+        total_cnt = 0
+        if self.dataloader.__class__.__name__ == "BatchEncoding":
+            self.dataloader = [self.dataloader.data]
+
+        for data in self.dataloader:
+            if data.__class__.__name__ == "BatchEncoding":
+                data = data.data
+            if data is None:
+                continue
+            if isinstance(data, torch.Tensor):
+                input_ids = data.to(self.model.device)
+                data_new = input_ids
+            elif isinstance(data, str):
+                if self.tokenizer is None:
+                    logger.error("please provide tokenizer for string input")
+                    exit(-1)
+                data = self.tokenizer(
+                    data, truncation=True, max_length=self.quantize_config.seqlen, return_tensors="pt"
+                ).data
+                data_new = {}
+                for key in data.keys():
+                    data_new[key] = data[key].to(self.model.device)
+                input_ids = data_new["input_ids"]
+            elif isinstance(data, tuple) or isinstance(data, list):
+                data_new = to_device(data, self.model.device)
+                input_ids = data_new[0]
+            else:
+                data_new = {}
+                for key in data.keys():
+                    data_new[key] = to_device(data[key], self.model.device)
+                    if key == "images":
+                        data_new[key] = to_dtype(data_new[key], self.model.dtype)
+                input_ids = data_new["input_ids"]
+            if input_ids.shape[-1] < self.quantize_config.seqlen:
+                continue
+            if need_attention_mask:
+                if (
+                    isinstance(data_new, dict)
+                    and "attention_mask" in data_new
+
and data_new["attention_mask"] is not None + ): + new_attention_mask = data_new["attention_mask"] + elif ( + self.tokenizer is not None + and hasattr(self.tokenizer, "pad_token") + and self.tokenizer.pad_token is not None + ): + new_attention_mask = (input_ids != self.tokenizer.pad_token_id).to(torch.long) + else: + # Default all ones + new_attention_mask = torch.ones_like(input_ids, dtype=torch.long) + + # For each sample, check if there are trailing repeated tokens + # If so, set the mask of the last token to 0 + batch_size, seq_len = input_ids.shape + for i in range(batch_size): + last_token = input_ids[i, -1] + # Check for trailing repeats + j = seq_len - 2 + repeated = False + while j >= 0 and input_ids[i, j] == last_token: + repeated = True + new_attention_mask[i, j] = 0 + j -= 1 + # If there was at least one repeat, set last token mask to 0 + if repeated: + new_attention_mask[i, -1] = 0 + + # Workaround: some models treat an all-1 attention mask as equivalent to None and + # will internally replace it with None for block inputs, which can cause tensor + # concatenation / shape-mismatch issues in downstream code. To avoid providing an + # all-1 mask, we force the last token in each sequence to be masked out (set to 0) + # so that the mask is never "all ones". This means the model will not attend to the + # last position, so the impact on accuracy is minimal as basically equivalent to dropping a single token + new_attention_mask[:, -1] = 0 + + self.quantizer.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) + else: + new_attention_mask = None + try: + kwargs = {"use_cache": False} + if new_attention_mask is not None and not (isinstance(data_new, dict) and "attention_mask" in data_new): + kwargs["attention_mask"] = new_attention_mask + + if isinstance(data_new, torch.Tensor): + self.model(data_new, **kwargs) + elif isinstance(data_new, tuple) or isinstance(data_new, list): + self.model(*data_new, **kwargs) + else: + self.model(**data_new, **kwargs) + except NotImplementedError: + pass + except RuntimeError as error: + error_msg = str(error) + if "The expanded size of the tensor" in str(error_msg) and "must match the existing size" in error_msg: + check_seqlen_compatible(self.quantize_config.seqlen, self.tokenizer, self.model) + logger.warning( + "When quantization encounters tensor shape mismatch error, " + "you can try to avoid it with batch_size=1" + ) + raise error + except Exception as error: + raise error + + total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1 + if total_cnt >= nsamples: + break + if total_cnt == 0: + logger.error( + f"no data has been cached, please provide more data with sequence length >={self.quantize_config.seqlen} in the " + f"dataset or decease the sequence length" + ) + exit(-1) + elif total_cnt < nsamples: + logger.warning_once( + f"An insufficient number of samples likely reduces the accuracy of the quantized model. " + f"Target samples count is {nsamples}, while valid samples count is {total_cnt}" + ) + + @torch.no_grad() + def _get_block_forward_func(self, name: str) -> Callable: + """Gets the forward function. + + Args: + name (str): The name of the function. + Returns: + function: The forward function. + """ + + def post_process_cache_data(batch_size, data, data_name): + """ + Processes store data for batch handling, reshaping if necessary. + + Args: + batch_size (int): The size of the batch. + data: The data value to store, potentially for caching. + data_name (str): Name of the data. 
+ + Returns: + Processed data or None + """ + new_data = data + if batch_size <= 1: + return new_data + if data_name in self.model_context.shared_cache_keys: + return None + if "alibi" in data_name: + if isinstance(data, torch.Tensor): + alibi = data + alibi = alibi.reshape(batch_size, -1, alibi.shape[1], alibi.shape[2]) + new_data = alibi + return new_data + + def forward(m, hidden_states=None, *positional_inputs, **kwargs): + """Rewrite forward function, process and collect input data. + + Args: + hidden_states (torch.Tensor): The hidden states tensor. + *positional_inputs: Variable number of positional arguments. + **kwargs: Variable number of keyword arguments. + + Returns: + NotImplementedError: Getting the first layer inputs and then raise the error to save runtime. + """ + if name not in self.inputs: + self.inputs[name] = {} + init_cache(positional_inputs, self.inputs[name]) + + if self.quantizer.batch_dim is None: + self.quantizer.batch_dim = 0 + if hidden_states is not None and self.quantizer.batch_size > 1: + if hidden_states.shape[0] > self.quantizer.batch_size: + self.quantizer.batch_dim = 1 + if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.quantizer.batch_size: + logger.error( + "this model has not been supported, " + "please raise an issue in https://github.com/intel/auto-round/issues" + " or try to set the `batch_size` to 1 and " + "`gradient_accumulate_steps` to your current batch size." + ) + exit(-1) + + if hidden_states is not None: + kwargs["hidden_states"] = hidden_states + + for key in kwargs.keys(): + if ( + isinstance(kwargs[key], torch.Tensor) + or isinstance(kwargs[key], list) + or isinstance(kwargs[key], tuple) + ): + if key not in self.inputs[name].keys(): # initialization + data = to_device(kwargs[key], device=torch.device("cpu")) + if data is None or ( + self.quantizer.batch_size > 1 and key in self.model_context.shared_cache_keys + ): + self.inputs[name][key] = data + continue + if self.quantizer.batch_size <= 1: + self.inputs[name][key] = [data] + else: + data = post_process_cache_data(self.quantizer.batch_size, data, key) + self.inputs[name][key] = list(torch.split(data, 1, dim=self.quantizer.batch_dim)) + else: # append cache inputs + new_data = post_process_cache_data(self.quantizer.batch_size, kwargs[key], key) + if new_data is None: # shareable args or NoneType + continue + new_data = to_device(new_data, device=torch.device("cpu")) + if self.quantizer.batch_size <= 1: + self.inputs[name][key].append(new_data) + else: + self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.quantizer.batch_dim))) + elif isinstance(kwargs[key], (str, bool, type(None))): + if key not in self.inputs[name].keys(): + self.inputs[name][key] = kwargs[key] + else: + # Parameters not to be cached + if check_skippable_keywords(key): + logger.warning_once( + f"Please note that '{key}' key" " is not currently used in quantization fine-tuning." 
+ ) + reset_params(self.inputs[name]) + + if self._should_stop_cache_forward(name): + raise NotImplementedError + else: + if hidden_states is not None: + kwargs.pop("hidden_states") + return m.orig_forward(hidden_states, *positional_inputs, **kwargs) + else: + # Currently only for Llama-3.2-Vision-Instruct Series + return m.orig_forward(*positional_inputs, **kwargs) + + return forward + + @torch.no_grad() + def _get_cache_data_hook_for_layer(self, name): + """A forward hook to save input max of a module + :param name: the module name + :return: A hook function.""" + + def cache_input_hook(module, inputs, outputs): + input = inputs + if isinstance(inputs, tuple) or isinstance(input, list): + input = inputs[0] + if name in self.inputs: + self.inputs[name].extend(list(torch.split(input.to("cpu"), 1, dim=0))) + else: + self.inputs[name] = list(torch.split(input.to("cpu"), 1, dim=0)) + + if self._should_stop_cache_forward(name): + raise NotImplementedError + + return cache_input_hook + + def _replace_forward(self): + """Replaces the forward function.""" + + def register_hook(n, m, hook_handles): + if n in self.to_cached_layers and type(m) not in SUPPORTED_LAYER_TYPES: ##block + m.orig_forward = m.forward + m.forward = partial(self._get_block_forward_func(n), m) + elif n in self.to_cached_layers: ##linear layer or conv1d layer + hook_func = self._get_cache_data_hook_for_layer(n) + hook_handle = m.register_forward_hook(hook_func) + hook_handles.append(hook_handle) + + self.model_context.replace_forward(register_hook) + + def _should_stop_cache_forward(self, name: str) -> bool: + """Determine whether current forward pass can stop after caching `name`.""" + if name == self.last_cache_name: + return True + + if self.last_cache_name is not None: + return False + + if not hasattr(self, "_cache_target_set") or not hasattr(self, "_cache_seen_targets"): + return False + + if name in self._cache_target_set: + self._cache_seen_targets.add(name) + + if not self._cache_target_set.issubset(self._cache_seen_targets): + return False + + # Lock the last cache name after the first full forward pass. 
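+        # Every module in `_cache_target_set` has now been seen at least once, so `name` is
+        # treated as the final module whose inputs need caching; later forward passes can
+        # stop here early instead of running the rest of the model.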
+        self.last_cache_name = name
+        return True
+
+    def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"):
+        input_ids, input_others = self._split_inputs(inputs, first_input_name)
+        clear_memory(device_list=self.compress_context.device_list)
+        input_ids = to_device(input_ids, self.compress_context.cache_device)
+        input_others = to_device(input_others, self.compress_context.cache_device)
+        # Calibration may have been run in bf16 (low_gpu_mem_usage), so cast the cached inputs
+        # to the dtype that will be used for tuning.
+
+        tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32
+        input_ids = to_dtype(input_ids, tmp_dtype)
+
+        for key in input_others.keys():
+            if isinstance(input_others[key], torch.Tensor) and (
+                input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16
+            ):
+                input_others[key] = input_others[key].to(tmp_dtype)
+            elif isinstance(input_others[key], list):
+                for i in range(len(input_others[key])):
+                    input_others[key][i] = to_dtype(input_others[key][i], tmp_dtype)
+        return input_ids, input_others
+
+    def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]:
+        input_ids = inputs[first_input_name]
+        inputs.pop(first_input_name, None)
+        input_others = inputs
+        return input_ids, input_others
+
+    def _quantize_blocks(
+        self,
+        model: torch.nn.Module,
+        inputs: dict,
+        block_names: list,
+        q_input: torch.Tensor = None,
+        nblocks: int = 1,
+        pbar: tqdm = None,
+    ):
+        """Quantize and dequantize the weights of the specified blocks in the model.
+
+        Args:
+            model: The PyTorch model to be quantized.
+            inputs: The input data for quantization.
+            block_names: The names of the blocks to be quantized and dequantized.
+            nblocks: The number of blocks to quantize and dequantize at a time.
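+            q_input: Previously quantized inputs for the first block, if available.
+            pbar: An optional shared tqdm progress bar; a new one is created when omitted.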
+ + Returns: + None + """ + clear_memory(device_list=self.compress_context.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = self._preprocess_block_inputs(inputs) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + if self.compress_context.low_cpu_mem_usage: + if nblocks == 1: + self._offloader.reload(model, n) + else: + self._offloader.reload(model, names) + + m.config = model.config if hasattr(model, "config") else None + q_input, input_ids = self.quantizer.quantize_block( + m, + input_ids, + input_others, + q_input=q_input, + ) + + if self.is_immediate_saving: + shard_writer(self, m, is_finalize=False) + + if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: + if nblocks == 1: + self._offloader.offload(model, n, overwrite=True) + else: + for name in names: + self._offloader.offload(model, name, overwrite=True) + if pbar is not None: + pbar.update(1) + + if not self.is_immediate_saving: + self.model = mv_module_from_gpu(self.model) + for n, m in self.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compress_context.device_list) + + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. + Returns: + The quantized model and layer configurations. + """ + + self.post_init() + self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) + + self._check_compatibility() + + if bool(self.quantizer.quant_block_list): + all_blocks = self.quantizer.quant_block_list + else: + all_blocks = get_block_names(self.model_context.model) + + if len(all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.model_context.model, self.quantizer.layer_config + + layer_names = _get_quantized_layer_names_outside_blocks( + model=self.model_context.model, + layer_config=self.quantizer.layer_config, + supported_types=SUPPORTED_LAYER_TYPES, + quant_block_list=self.quantizer.quant_block_list, + ) + start_time = time.time() + all_first_block_names = [block[0] for block in all_blocks] + if len(layer_names) > 0: + logger.info( + "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names + ) + else: + logger.info("start to cache block inputs") + all_inputs = self.try_cache_inter_data_gpucpu( + all_first_block_names, + self.quantize_config.nsamples, + layer_names, + ) + self.inputs = all_inputs + is_quantized_embedding = self._quantize_embedding_layer() + clear_memory(device_list=self.compress_context.device_list) + all_q_inputs = None + if is_quantized_embedding: + all_inputs = copy.deepcopy(self.inputs) + clear_memory(self.inputs, device_list=self.compress_context.device_list) + all_q_inputs = self.try_cache_inter_data_gpucpu( + all_first_block_names, self.quantize_config.nsamples, layer_names + ) + self.inputs = all_q_inputs + # Remove accelerate dispatch hooks before moving parameters. + # hf_device_map is kept for reference but hooks are no longer needed. + if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + self.model_context.model = mv_module_from_gpu(self.model_context.model) + clear_memory(device_list=self.compress_context.device_list) + logger.info("caching done") + if self.compress_context.low_cpu_mem_usage: + self._offloader.offload( + self.model_context.model, all_blocks, clear_memory=True, device_list=self.compress_context.device_list + ) + if len(all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) + else: + pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar + + for block_names in all_blocks: + inputs = all_inputs[block_names[0]] + all_inputs.pop(block_names[0]) + q_inputs = None + if all_q_inputs is not None: + q_inputs = all_q_inputs[block_names[0]] + all_q_inputs.pop(block_names[0]) + + inputs, q_inputs = _update_inputs(inputs, q_inputs) + + clear_memory(self.inputs, device_list=self.compress_context.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.quantizer.batch_size: + self.quantizer.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.model_context.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.nblocks, + pbar=pbar, + ) + if self.is_immediate_packing and len(self.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.formats)} formats." 
+                )
+        pbar.set_description("Quantizing done")
+        pbar.close()
+        self._quantize_layers(layer_names, all_inputs)
+
+        convert_module_to_hp_if_necessary(
+            self.model_context.model, self.model_context.amp_dtype, self.compress_context.device, to_cpu=True
+        )
+        if self.is_immediate_saving:
+            shard_writer(self, is_finalize=True)
+
+        if self.compress_context.low_cpu_mem_usage:
+            self._offloader.reload(self.model_context.model)
+
+        end_time = time.time()
+        cost_time = end_time - start_time
+        logger.info(f"quantization tuning time {cost_time:.2f}s")
+
+        # Dump a summary
+        quantized_layers = []
+        unquantized_layers = []
+        for n, m in self.model_context.model.named_modules():
+            if isinstance(m, tuple(SUPPORTED_LAYER_TYPES)):
+                if check_to_quantized(m):
+                    quantized_layers.append(n)
+                else:
+                    unquantized_layers.append(n)
+            elif hasattr(m, "scales") or hasattr(m, "scale"):  # packing_immediately
+                quantized_layers.append(n)
+        summary_info = (
+            f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} "
+            f"layers in the model"
+        )
+        if len(unquantized_layers) > 0:
+            summary_info += f", {unquantized_layers} have not been quantized"
+        logger.info(summary_info)
+
+        self.model_context.quantized = True
+        return self.model_context.model, self.quantizer.layer_config
+
+    def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
+        """Quantizes specified layers based on inputs and configuration.
+
+        Args:
+            layer_names (list): List of layer names to quantize.
+            layer_inputs (dict): Dictionary mapping layer names to input data.
+
+        Returns:
+            None
+        """
+        # TODO: currently we treat all layers outside blocks as post-block layers, which is not optimal
+        # If there is no cached input for a layer, we fall back to rtn
+
+        for layer_name in copy.deepcopy(layer_names):
+            if layer_name not in layer_inputs:
+                if self.act_bits < 16 and not self.act_dynamic:
+                    # Activation quantization requires collected inputs
+                    msg_prefix = (
+                        f"Activation max hook for layer '{layer_name}' is unavailable due to "
+                        f"insufficient collected inputs. "
+                    )
+                    if "fp8_e5m2" in self.act_data_type:
+                        logger.warning(msg_prefix + "Please note that a unit scale is used for this layer.")
+                    else:
+                        logger.warning(
+                            msg_prefix + "Static activation quantization is not supported or ineffective here; "
+                            "skipping quantization for this layer."
+ ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.model, layer_name) + layer = layer.to(self.compress_context.device) + layer = convert_module_to_hp_if_necessary( + layer, self.model_context.amp_dtype, self.compress_context.device + ) + set_module(self.model, layer_name, layer) + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.enable_torch_compile, + device=self.compress_context.device, + disable_opt_rtn=self.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.enable_quanted_input + has_gguf = False + + if hasattr(self, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.formats) + if has_gguf and self.is_immediate_packing: + enable_quanted_input = False + + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: + dispatch_model(self.model, self.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + q_layer_inputs = self.try_cache_inter_data_gpucpu( + [], self.quantize_config.nsamples, layer_names=layer_names + ) + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.model + ) # self.model.hf_device_map has not been changed + if not self.is_immediate_saving: + self.model = mv_module_from_gpu(self.model) + clear_memory(device_list=self.compress_context.device_list) + quant_layer = self.quantizer.quantize_layer + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.compress_context.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.compress_context.cache_device) + quant_layer(layer_name, layer_input, q_layer_input, device=self.compress_context.device) + if self.is_immediate_packing: + immediate_pack(layer_name, self.quantizer.layer_config) + + if self.is_immediate_saving: + m = get_module(self.model, layer_name) + shard_writer(self, m, name=layer_name, is_finalize=False) + del layer_input + clear_memory(q_layer_input, device_list=self.compress_context.device_list) + memory_monitor.log_summary() + + def _check_compatibility(self) -> None: + """Checks compatibility of the configurations and model.""" + if ( + self.quantize_config.seqlen is not None + and hasattr(self.model_context.model, "config") + and hasattr(self.model_context.model.config, "max_position_embeddings") + ): + if self.model_context.model.config.max_position_embeddings < self.quantize_config.seqlen: + logger.warning( + f"Change sequence length to {self.model_context.model.config.max_position_embeddings} " + "due to the limitation of max_position_embeddings" + ) + self.quantize_config.seqlen = min( + self.quantize_config.seqlen, self.model_context.model.config.max_position_embeddings + ) + + if self.quantize_config.seqlen is not None and hasattr(self.tokenizer, "model_max_length"): + if self.tokenizer.model_max_length < 
self.quantize_config.seqlen: + logger.warning( + f"Change sequence length to {self.tokenizer.model_max_length} " + "due to the limitation of model_max_length. " + "You can also try to increase the model_max_length to avoid this issue." + ) + self.quantize_config.seqlen = min(self.quantize_config.seqlen, self.tokenizer.model_max_length) + + if self.group_size == 0 and "fp8" not in self.data_type: + logger.warning("`group_size==0` is not supported for data_type other than fp8 ") + + if ( + self.bits <= 2 + and (self.iters < 1000 or not getattr(self.quantize_config, "enable_alg_ext", False)) + and self.super_group_size is None + ): + logger.warning( + "for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` " + ) + + +class ImatrixCompressor(CalibCompessor): + need_calib: bool = True + + def __init__( + self, + config: AlgConfig, + model: torch.nn.Module, + **kwargs, + ): + kwargs["iters"] = 0 + super().__init__( + config, + model, + **kwargs, + ) + + def _quantize_via_rtn_blockwise(self) -> None: + """Quantize model layers block by block using cached inputs and imatrix.""" + + all_blocks = self.quantizer.quant_block_list if self.quantizer.quant_block_list else get_block_names(self.model) + if not all_blocks: + raise ValueError("Could not find any blocks. Check the model or quant_block_list.") + + all_first_block_names = [block[0] for block in all_blocks] + layer_names = _get_quantized_layer_names_outside_blocks( + model=self.model_context.model, + layer_config=self.quantizer.layer_config, + supported_types=SUPPORTED_LAYER_TYPES, + quant_block_list=self.quantizer.quant_block_list, + ) + if self.quantize_config.is_act_quantize and (not self.quantize_config.act_dynamic or len(layer_names) > 0): + if len(layer_names) > 0: + logger.warning( + "quantize layers outside blocks for static activation quantizaiton" + " will significantly increase calibration time" + ) + all_inputs = self.try_cache_inter_data_gpucpu( + all_first_block_names, self.quantize_config.nsamples, layer_names + ) + else: + all_inputs = self.cache_inter_data(all_first_block_names, self.quantize_config.nsamples) + + # Clear hooks for multi-GPU setups + if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + + for block_names in all_blocks: + first_block = block_names[0] + inputs = all_inputs.pop(first_block) + input_keys = [k for k in inputs if k.startswith("hidden_state")] + if len(input_keys) != 1: + raise RuntimeError( + "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.inputs, device_list=self.compress_context.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.quantize_config.batch_size: + self.quantize_config.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compress_context.cache_device) + input_others = to_device(inputs, self.compress_context.cache_device) + + tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.model_context.model, block_name) + + self.quantizer.quantize_block( + block, + input_ids, + input_others, + ) + + if self.low_cpu_mem_usage and not self.is_immediate_saving: + self._offloader.offload(self.model_context.model, block_name) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compress_context.device_list) + else: + clear_memory(device_list=self.compress_context.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + # Collect names of quantizable layers not belonging to any block + remain_layer_names = [] + block_name_set = set(name for block in all_blocks for name in block) + for n, m in self.model_context.model.named_modules(): + if not check_to_quantized(m): + continue + # Skip if this layer is part of any block (by prefix match) + if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): + continue + remain_layer_names.append(n) + + for name in remain_layer_names: + dtype = None + if self.super_group_size is not None: + dtype = torch.float32 + self.quantizer.quantize_layer(name, dtype=dtype) + # clear_memory(device_list=self.compress_context.device_list) + # if self.is_immediate_saving: + # shard_writer(self, is_finalize=True) + + def _quant_rtn_with_imatrix(self) -> None: + """Performs RTN quantization using input activation statistics (imatrix). + + This method accumulates per-channel second-moment activation statistics (imatrix) + via forward hooks and uses them to perform RTN quantization. If CUDA memory runs out, + it falls back to CPU-based blockwise quantization. 
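+        Here a layer's imatrix is the per-input-channel sum of squared activations,
+        accumulated over all calibration tokens by the registered forward hooks.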
+ + Returns: + None + """ + logger.info("start to compute imatrix") + + # Load dataset + from auto_round.calib_dataset import get_dataloader + + if isinstance(self.dataset, str): + if self.model_context.tokenizer is None: + raise ValueError("A tokenizer must be set for the model when using a dataset string.") + dataset_name = self.dataset.replace(" ", "") + self.dataloader = get_dataloader( + self.model_context.tokenizer, + self.quantize_config.seqlen, + dataset_name, + self.seed, + self.quantize_config.batch_size, + self.quantize_config.nsamples, + ) + else: + self.dataloader = self.dataset + + model = self.model_context.model + + # Dispatch multi-GPU model if necessary + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + dispatch_model(model, model.hf_device_map) + + def register_act_hook(model): + """Registers hooks to accumulate activation squared norms into `imatrix`.""" + + def get_imatrix_hook(module, input, output): + input = input[0] if isinstance(input, (tuple, list)) else input + flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) + squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) + + if not hasattr(module, "imatrix"): + module.imatrix = squared + module.imatrix_cnt = input.shape[0] + else: + module.imatrix += squared.to(module.imatrix.device) + module.imatrix_cnt += input.shape[0] + + hook_handles = [] + for name, module in model.named_modules(): + if type(module) in SUPPORTED_LAYER_TYPES and check_to_quantized(module): + hook = module.register_forward_hook(get_imatrix_hook) + hook_handles.append(hook) + return hook_handles + + hooks = register_act_hook(model) + + try: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + safe_to_cpu_(model) + clear_memory(device_list=self.compress_context.device_list) + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + # Final fallback: warn and use CPU-only quantization + logger.warning( + "Fallback to CPU. " + "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." + ) + safe_to_cpu_(model) + clear_memory(device_list=self.compress_context.device_list) + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + + orig_device = self.compress_context.device + self.compress_context.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compress_context.device = orig_device + except Exception as e: + raise + finally: + # Always remove hooks + for hook in hooks: + hook.remove() + + @torch.inference_mode() + def quantize(self): + """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. + + If the target format includes GGUF with `k`, and optimized RTN is enabled, + blockwise quantization with input caching and imatrix is used. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
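+
+        Note: the imatrix statistics are accumulated during the same forward passes that cache
+        block inputs, so no separate calibration pass is required.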
+ """ + self.post_init() + self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) + + if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + self._quantize_embedding_layer() # leave to gguf itself to handle + + # Release memory + clear_memory(device_list=self.compress_context.device_list) + + self._quant_rtn_with_imatrix() + self.model_context.quantized = True + return self.model_context.model, self.quantizer.layer_config diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py new file mode 100644 index 000000000..5893abe73 --- /dev/null +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -0,0 +1,157 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import torch + +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.context.model import ModelContext +from auto_round.logger import logger + + +class DiffusionMixin: + """Diffusion-specific functionality mixin. + + This mixin adds diffusion model-specific functionality to any compressor + (CalibCompessor, ZeroShotCompressor, ImatrixCompressor, etc). It handles + diffusion models (like Stable Diffusion, FLUX) that require special pipeline + handling and data generation logic. + + Can be combined with: + - CalibCompessor (for AutoRound with calibration) + - ImatrixCompressor (for RTN with importance matrix) + - ZeroShotCompressor (for basic RTN) + + Diffusion-specific parameters: + guidance_scale: Control how much image generation follows text prompt + num_inference_steps: Reference number of denoising steps + generator_seed: Seed for initial noise generation + """ + + def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_seed=None, **kwargs): + # Store diffusion-specific attributes + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed + self.pipe = None # Will be set during model loading + self.pipe_config = None + + # Call parent class __init__ (will be CalibCompessor, ImatrixCompressor, etc) + super().__init__(*args, **kwargs) + + def post_init(self): + """Override post_init to handle diffusion-specific model loading.""" + # Load diffusion model as pipeline before standard initialization + if isinstance(self.model_context.model, str): + self._load_diffusion_model() + + # Continue with standard post_init + super().post_init() + + def _load_diffusion_model(self): + """Load diffusion model using pipeline. + + This method loads the full diffusion pipeline and extracts the + transformer/unet component for quantization. 
+ """ + from auto_round.utils import diffusion_load_model + + if isinstance(self.model_context.model, str): + # Load diffusion pipeline + logger.info(f"Loading diffusion model from {self.model_context.model}") + pipe, pipe_config = diffusion_load_model( + pretrained_model_name_or_path=self.model_context.model, + platform=self.platform, + device=self.compress_context.device, + trust_remote_code=self.model_context.trust_remote_code, + ) + self.pipe = pipe + self.pipe_config = pipe_config + + # Extract the transformer/unet component as the model + if hasattr(pipe, "transformer"): + extracted_model = pipe.transformer + logger.info("Extracted transformer from diffusion pipeline") + elif hasattr(pipe, "unet"): + extracted_model = pipe.unet + logger.info("Extracted unet from diffusion pipeline") + else: + raise ValueError("Cannot find transformer or unet in diffusion pipeline") + + # Replace the model path with the actual model + self.model_context.model = extracted_model + + @torch.no_grad() + def calib(self, nsamples, bs): + """Perform diffusion-specific calibration for quantization. + + Override parent's calib method to use diffusion dataset loading logic. + """ + from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader + + if self.pipe is None: + raise ValueError("Diffusion pipeline must be loaded before calibration") + + logger.info(f"Preparing diffusion dataloader with {nsamples} samples") + + # Get diffusion dataloader + self.dataloader = get_diffusion_dataloader( + pipe=self.pipe, + dataset=self.dataset, + nsamples=nsamples, + batch_size=bs, + seed=self.seed, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator_seed=self.generator_seed, + ) + + # Process data through the model for calibration + total_cnt = 0 + for data in self.dataloader: + if data is None: + continue + + # Diffusion data is usually already properly formatted + if isinstance(data, dict): + # Move all tensors to device + data_new = {} + for key, value in data.items(): + if isinstance(value, torch.Tensor): + data_new[key] = value.to(self.model_context.model.device) + else: + data_new[key] = value + else: + data_new = data + + try: + if isinstance(data_new, dict): + self.model_context.model(**data_new) + else: + self.model_context.model(data_new) + except NotImplementedError: + pass + except Exception as e: + logger.warning(f"Calibration forward pass failed: {e}") + + total_cnt += bs + if total_cnt >= nsamples: + break + + if total_cnt == 0: + logger.error("no data has been cached, please provide more data") + exit(-1) + elif total_cnt < nsamples: + logger.warning(f"Insufficient number of samples: required {nsamples}, but only {total_cnt} were processed.") diff --git a/auto_round/compressors_new/docs/compressors_new_architecture.md b/auto_round/compressors_new/docs/compressors_new_architecture.md new file mode 100644 index 000000000..1cd33111a --- /dev/null +++ b/auto_round/compressors_new/docs/compressors_new_architecture.md @@ -0,0 +1,291 @@ +# Compressor New Architecture + +## Overview + +本文档介绍了 `compressors_new` 的新架构设计,该设计统一了 LLM、MLLM 和 Diffusion 模型的量化入口。 + +## 架构设计 + +### 核心思想 + +通过 `entry.py` 中的 `Compressor` 类作为统一入口,根据模型类型和算法配置动态选择合适的 Compressor 实现。 + +### 组件结构 + +``` +compressors_new/ +├── entry.py # 统一入口,自动检测模型类型 +├── base.py # BaseCompressor 基类 +├── calib.py # CalibCompessor (需要校准的算法) +├── zero_shot.py # ZeroShotCompressor (不需要校准的算法) +├── mllm_mixin.py # MLLMCalibCompressor (MLLM + 校准) +└── diffusion_mixin.py # DiffusionCalibCompressor (Diffusion + 
校准) +``` + +### 类继承关系 + +``` +BaseCompressor + ├── CalibCompessor (基于校准的压缩) + │ ├── MLLMCalibCompressor (MLLM 专用) + │ └── DiffusionCalibCompressor (Diffusion 专用) + │ + └── ZeroShotCompressor (不需要校准) +``` + +## 使用方法 + +### 1. 基本用法 + +```python +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.compressors_new.entry import Compressor + +# 创建配置 +config = AutoRoundConfig( + scheme="W4A16", + iters=200, + nsamples=128, +) + +# 统一入口 - 自动检测模型类型 +compressor = Compressor( + config=config, + model="/path/to/model", # 可以是 LLM/MLLM/Diffusion + tokenizer=tokenizer, + platform="hf", + format=None, +) + +# 执行量化 +quantized_model, layer_config = compressor.quantize() +``` + +### 2. MLLM 模型量化 + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200) + +# 会自动使用 MLLMCalibCompressor +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + tokenizer=tokenizer, + processor=processor, # MLLM 特定参数 + image_processor=image_processor, # MLLM 特定参数 + template="qwen2_vl", # MLLM 特定参数 + extra_data_dir="/path/to/images", # MLLM 特定参数 +) + +quantized_model, layer_config = compressor.quantize() +``` + +### 3. Diffusion 模型量化 + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200) + +# 会自动使用 DiffusionCalibCompressor +compressor = Compressor( + config=config, + model="/models/stable-diffusion-2-1", + platform="hf", + guidance_scale=7.5, # Diffusion 特定参数 + num_inference_steps=50, # Diffusion 特定参数 +) + +quantized_model, layer_config = compressor.quantize() +``` + +## 模型类型检测 + +`entry.py` 中的 `detect_model_type()` 函数自动检测模型类型: + +```python +def detect_model_type(model): + """检测模型类型 + + Returns: + "mllm" | "diffusion" | "llm" + """ + if is_diffusion_model(model): + return "diffusion" + if is_mllm_model(model): + return "mllm" + return "llm" +``` + +检测逻辑: +1. 优先检测是否为 Diffusion 模型(检查 `model_index.json`) +2. 然后检测是否为 MLLM 模型(检查 `processor_config.json` 等) +3. 默认为标准 LLM 模型 + +## 动态 Compressor 选择 + +`entry.py` 中的 `Compressor.__new__()` 方法根据以下条件动态选择: + +### 决策树 + +``` +Compressor.__new__() +│ +├── AutoRoundConfig (需要校准) +│ ├── MLLM → MLLMCalibCompressor +│ ├── Diffusion → DiffusionCalibCompressor +│ └── LLM → CalibCompessor +│ +└── RTNConfig + ├── enable_imatrix=True → ImatrixCompressor + └── enable_imatrix=False → ZeroShotCompressor +``` + +## 扩展新模型类型 + +如果需要支持新的模型类型,按照以下步骤: + +### 1. 创建专用 Compressor + +```python +# compressors_new/new_model_calib.py +from auto_round.compressors_new.calib import CalibCompessor + + +class NewModelCalibCompressor(CalibCompessor): + def __init__(self, config, model, **kwargs): + # 存储模型特定参数 + self.special_param = kwargs.pop("special_param", None) + super().__init__(config, model, **kwargs) + + @torch.no_grad() + def calib(self, nsamples, bs): + # 实现模型特定的校准逻辑 + # 通常需要: + # 1. 加载模型特定的 dataloader + # 2. 处理模型特定的数据格式 + # 3. 执行前向传播进行校准 + pass +``` + +### 2. 更新模型检测逻辑 + +```python +# 在 entry.py 的 detect_model_type() 中添加 +def detect_model_type(model): + if is_new_model_type(model): # 添加新的检测函数 + return "new_model_type" + if is_diffusion_model(model): + return "diffusion" + # ... +``` + +### 3. 
更新 Compressor 入口 + +```python +# 在 entry.py 的 Compressor.__new__() 中添加 +if isinstance(config, AutoRoundConfig): + if model_type == "new_model_type": + from auto_round.compressors_new.new_model_calib import NewModelCalibCompressor + return NewModelCalibCompressor(config, **local_args, **kwargs) + elif model_type == "mllm": + # ... +``` + +## 与旧架构的兼容性 + +### 旧架构 (compressors/) + +```python +from auto_round.compressors.mllm.compressor import MLLMCompressor + +compressor = MLLMCompressor( + model=model, + # ... 参数 +) +``` + +### 新架构 (compressors_new/) + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(...) +compressor = Compressor( + config=config, + model=model, + # ... 参数 +) +``` + +**优势:** +1. 统一入口,无需手动选择 Compressor +2. 自动模型类型检测 +3. 更好的代码组织和复用 +4. 易于扩展新模型类型 + +## 实现细节 + +### MLLMCalibCompressor + +重写的关键方法: +- `calib()`: 使用 MLLM 专用的 dataloader 和 template +- 处理 processor, image_processor, template 等 MLLM 特定参数 + +### DiffusionCalibCompressor + +重写的关键方法: +- `post_init()`: 预先加载 diffusion pipeline +- `_load_diffusion_model()`: 加载 pipeline 并提取 transformer/unet +- `calib()`: 使用 diffusion 专用的 dataloader + +### 数据流 + +``` +1. Compressor.__new__() + └── 检测模型类型 + └── 创建对应的 Compressor 实例 + +2. CompressorInstance.__init__() + └── 存储模型特定参数 + └── 调用 super().__init__() + +3. CompressorInstance.quantize() + └── post_init() + └── _load_model() (可能被重写) + └── calib() (可能被重写) + └── 执行量化算法 + +4. 返回量化后的模型 +``` + +## 测试 + +运行测试脚本: + +```bash +python test_compressor_new_arch.py +``` + +这将测试: +- 模型类型检测 +- LLM Compressor 创建 +- MLLM Compressor 创建 +- Diffusion Compressor 创建 + +## 总结 + +新架构的主要优势: + +1. **统一入口**: 一个 `Compressor` 类处理所有模型类型 +2. **自动检测**: 无需手动判断模型类型 +3. **易于扩展**: 添加新模型类型只需3步 +4. **代码复用**: 通过继承复用基类功能 +5. **清晰结构**: 每种模型类型有独立的 Compressor 实现 + +这种设计符合开闭原则(Open-Closed Principle),对扩展开放,对修改关闭。 diff --git a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md new file mode 100644 index 000000000..4beca598c --- /dev/null +++ b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md @@ -0,0 +1,787 @@ +# Compressor 新架构说明 + +## 概述 + +本文档介绍了 `compressors_new` 的新架构设计,实现了对 LLM、MLLM 和 Diffusion 模型的统一量化入口。 + +## 架构设计 + +### 核心思想 + +通过 `entry.py` 中的 `Compressor` 类作为统一入口点,根据模型类型和算法配置自动选择合适的 Compressor 实现类。 + +### 目录结构 + +``` +compressors_new/ +├── entry.py # 统一入口,自动检测模型类型 +├── base.py # BaseCompressor 基类 +├── calib.py # CalibCompessor (基于校准的压缩) +├── zero_shot.py # ZeroShotCompressor (零样本压缩) +├── mllm_mixin.py # MLLMCalibCompressor (多模态模型校准压缩) +└── diffusion_mixin.py # DiffusionCalibCompressor (扩散模型校准压缩) +``` + +### 类继承关系 + +``` +BaseCompressor (基础压缩器) + │ + ├── CalibCompessor (基于校准的压缩器) + │ │ + │ ├── MLLMCalibCompressor (多模态模型专用) + │ │ └── 支持视觉-语言模型(如 Qwen2-VL, LLaVA 等) + │ │ + │ └── DiffusionCalibCompressor (扩散模型专用) + │ └── 支持文生图模型(如 Stable Diffusion, FLUX 等) + │ + └── ZeroShotCompressor (零样本压缩器) + └── 用于 RTN 等不需要校准的算法 +``` + +## 使用方法 + +### 1. 
基本用法(自动检测) + +```python +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.compressors_new.entry import Compressor + +# 创建量化配置 +config = AutoRoundConfig( + scheme="W4A16", # 量化方案: 权重4比特,激活16比特 + iters=200, # 迭代次数 + nsamples=128, # 校准样本数 +) + +# 统一入口 - 自动检测模型类型并选择合适的 Compressor +compressor = Compressor( + config=config, + model="/path/to/model", # 支持 LLM/MLLM/Diffusion 模型 + tokenizer=tokenizer, + platform="hf", # 平台: "hf" 或 "model_scope" +) + +# 执行量化 +quantized_model, layer_config = compressor.quantize() +``` + +### 2. MLLM 多模态模型量化 + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from transformers import AutoProcessor, AutoTokenizer + +# 准备 tokenizer 和 processor +tokenizer = AutoTokenizer.from_pretrained("/models/Qwen2-VL-2B-Instruct") +processor = AutoProcessor.from_pretrained("/models/Qwen2-VL-2B-Instruct") + +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) + +# 自动使用 MLLMCalibCompressor +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + tokenizer=tokenizer, + processor=processor, # MLLM 特定: 多模态处理器 + image_processor=None, # MLLM 特定: 图像处理器 + template="qwen2_vl", # MLLM 特定: 模板名称 + extra_data_dir="/path/to/images", # MLLM 特定: 额外数据路径 + quant_nontext_module=False, # 是否量化非文本模块 +) + +quantized_model, layer_config = compressor.quantize() +``` + +### 3. Diffusion 扩散模型量化 + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) + +# 自动使用 DiffusionCalibCompressor +compressor = Compressor( + config=config, + model="/models/stable-diffusion-2-1", + platform="hf", + guidance_scale=7.5, # Diffusion 特定: 引导强度 + num_inference_steps=50, # Diffusion 特定: 推理步数 + generator_seed=42, # Diffusion 特定: 随机种子 + dataset="coco2014", # 校准数据集 +) + +quantized_model, layer_config = compressor.quantize() +``` + +### 4. RTN 量化(零样本) + +```python +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.rtn.config import RTNConfig + +# RTN 不需要校准数据 +config = RTNConfig(scheme="W4A16") + +# 自动使用 ZeroShotCompressor 或 ImatrixCompressor +compressor = Compressor( + config=config, + model="/path/to/model", + format="gguf_k", # 如果是 gguf_k 格式,会使用 ImatrixCompressor +) + +quantized_model, layer_config = compressor.quantize() +``` + +## 模型类型自动检测 + +`entry.py` 中的 `detect_model_type()` 函数负责自动检测模型类型: + +```python +def detect_model_type(model): + """检测模型类型 + + Args: + model: 模型实例或模型路径字符串 + + Returns: + str: "mllm" | "diffusion" | "llm" + """ + from auto_round.utils import is_mllm_model, is_diffusion_model + + # 1. 优先检测 Diffusion 模型 + if is_diffusion_model(model): + return "diffusion" + + # 2. 检测 MLLM 模型 + if is_mllm_model(model): + return "mllm" + + # 3. 默认为标准 LLM + return "llm" +``` + +### 检测逻辑说明 + +1. **Diffusion 模型检测** (`is_diffusion_model`): + - 检查目录中是否存在 `model_index.json` 文件 + - 检查是否为 `DiffusionPipeline` 实例 + +2. **MLLM 模型检测** (`is_mllm_model`): + - 检查是否存在 `processor_config.json` + - 检查是否存在 `preprocessor_config.json` + - 检查 config 中是否包含多模态相关键(vision_config 等) + +3. 
**LLM 模型** (默认): + - 所有其他情况 + +## Compressor 动态选择逻辑 + +`Compressor.__new__()` 方法根据配置类型和模型类型动态创建实例: + +### 决策流程图 + +``` +Compressor.__new__() +│ +├─ 检测模型类型 (detect_model_type) +│ ├─ "diffusion" +│ ├─ "mllm" +│ └─ "llm" +│ +├─ AutoRoundConfig (需要校准) +│ ├─ model_type == "mllm" +│ │ └─> MLLMCalibCompressor +│ │ └─ 使用 MLLM dataloader +│ │ └─ 支持 processor, template 等 +│ │ +│ ├─ model_type == "diffusion" +│ │ └─> DiffusionCalibCompressor +│ │ └─ 加载 diffusion pipeline +│ │ └─ 提取 transformer/unet +│ │ +│ └─ model_type == "llm" +│ └─> CalibCompessor +│ └─ 标准文本数据集 +│ +└─ RTNConfig (零样本量化) + ├─ enable_imatrix == True + │ └─> ImatrixCompressor + │ └─ 使用 importance matrix + │ + └─ enable_imatrix == False + └─> ZeroShotCompressor + └─ 纯 RTN 量化 +``` + +### 代码实现 + +```python +class Compressor(object): + def __new__(cls, config, model, tokenizer=None, platform="hf", format=None, **kwargs): + # 检测模型类型 + model_type = detect_model_type(model) + + if isinstance(config, AutoRoundConfig): + # AutoRound 需要校准 + if model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMCalibCompressor + + return MLLMCalibCompressor(config, model, tokenizer, platform, format, **kwargs) + elif model_type == "diffusion": + from auto_round.compressors_new.diffusion_mixin import DiffusionCalibCompressor + + return DiffusionCalibCompressor(config, model, tokenizer, platform, format, **kwargs) + else: + return CalibCompessor(config, model, tokenizer, platform, format, **kwargs) + + elif isinstance(config, RTNConfig): + # RTN 可能需要 imatrix + if enable_imatrix: + from auto_round.compressors_new.calib import ImatrixCompressor + + return ImatrixCompressor(config, model, tokenizer, platform, format, **kwargs) + return ZeroShotCompressor(config, model, tokenizer, platform, format, **kwargs) +``` + +## 扩展新模型类型 + +如果需要支持新的模型类型,按照以下步骤操作: + +### 步骤 1: 创建专用 Compressor 类 + +在 `compressors_new/` 下创建新文件,例如 `audio_calib.py`: + +```python +# compressors_new/audio_calib.py +from typing import Union +import torch +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.compressors_new.calib import CalibCompessor +from auto_round.logger import logger + + +class AudioCalibCompressor(CalibCompessor): + """音频模型专用校准压缩器""" + + def __init__( + self, + config: Union[AlgConfig, list[AlgConfig]], + model: Union[torch.nn.Module, str], + tokenizer=None, + platform="hf", + format=None, + audio_processor=None, # 音频特定参数 + **kwargs, + ): + # 保存音频特定参数 + self.audio_processor = audio_processor + + # 调用父类初始化 + super().__init__( + config=config, + model=model, + tokenizer=tokenizer, + platform=platform, + format=format, + **kwargs, + ) + + @torch.no_grad() + def calib(self, nsamples, bs): + """实现音频模型特定的校准逻辑""" + from your_audio_module import get_audio_dataloader + + logger.info("Preparing audio dataloader...") + + # 获取音频专用的 dataloader + self.dataloader = get_audio_dataloader( + model=self.model_context.model, + audio_processor=self.audio_processor, + dataset=self.dataset, + nsamples=nsamples, + batch_size=bs, + seed=self.seed, + ) + + # 执行校准前向传播 + total_cnt = 0 + for data in self.dataloader: + if data is None: + continue + + # 处理并前向传播 + try: + if isinstance(data, dict): + self.model_context.model(**data) + else: + self.model_context.model(data) + except Exception as e: + logger.warning(f"Calibration failed: {e}") + + total_cnt += bs + if total_cnt >= nsamples: + break + + if total_cnt == 0: + logger.error("No calibration data processed") + exit(-1) +``` + +### 步骤 2: 更新模型检测逻辑 + +在 `entry.py` 中添加音频模型检测: + +```python +# entry.py +def 
detect_model_type(model): + """检测模型类型""" + from auto_round.utils import is_mllm_model, is_diffusion_model, is_audio_model + + # 按特殊性从高到低检测 + if is_diffusion_model(model): + return "diffusion" + + if is_audio_model(model): # 新增音频检测 + return "audio" + + if is_mllm_model(model): + return "mllm" + + return "llm" +``` + +### 步骤 3: 更新 Compressor 入口 + +在 `entry.py` 的 `Compressor.__new__()` 中添加音频分支: + +```python +class Compressor(object): + def __new__(cls, config, model, tokenizer=None, platform="hf", format=None, **kwargs): + local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} + + # 检测模型类型 + model_type = detect_model_type(model) + + if isinstance(config, AutoRoundConfig): + # 新增音频分支 + if model_type == "audio": + from auto_round.compressors_new.audio_calib import AudioCalibCompressor + + return AudioCalibCompressor(config, **local_args, **kwargs) + elif model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMCalibCompressor + + return MLLMCalibCompressor(config, **local_args, **kwargs) + # ... 其他分支 +``` + +### 步骤 4: 实现模型检测函数 + +在 `auto_round/utils/model.py` 中添加: + +```python +def is_audio_model(model_or_path: Union[str, torch.nn.Module]) -> bool: + """检测是否为音频模型""" + if isinstance(model_or_path, str): + # 检查配置文件中的特征 + config_path = os.path.join(model_or_path, "config.json") + if os.path.exists(config_path): + with open(config_path) as f: + config = json.load(f) + # 检查是否包含音频相关配置 + if "audio_config" in config: + return True + if config.get("model_type") in ["whisper", "wav2vec2", "hubert"]: + return True + + if isinstance(model_or_path, torch.nn.Module): + # 检查模块中是否有音频相关组件 + for name, module in model_or_path.named_modules(): + if "audio" in name.lower(): + return True + + return False +``` + +## 实现细节 + +### MLLMCalibCompressor 关键实现 + +```python +class MLLMCalibCompressor(CalibCompessor): + def __init__( + self, config, model, processor=None, image_processor=None, template=None, extra_data_dir=None, **kwargs + ): + # 保存 MLLM 特定参数 + self.processor = processor + self.image_processor = image_processor + self.template = template + self.extra_data_dir = extra_data_dir + super().__init__(config, model, **kwargs) + + @torch.no_grad() + def calib(self, nsamples, bs): + # 1. 选择合适的 template + self.template_obj = get_template(self.template or "default") + + # 2. 获取 MLLM dataloader + self.dataloader = get_mllm_dataloader( + model=self.model_context.model, + tokenizer=self.tokenizer, + dataset=self.dataset, + processor=self.processor, + image_processor=self.image_processor, + nsamples=nsamples, + seqlen=self.quantize_config.seqlen, + seed=self.seed, + batch_size=bs, + template=self.template_obj, + extra_data_dir=self.extra_data_dir, + ) + + # 3. 
执行校准 + for data in self.dataloader: + self.model_context.model(**data) +``` + +**关键点:** +- 处理 `processor`, `image_processor`, `template` 等 MLLM 特定参数 +- 使用 `get_mllm_dataloader` 获取多模态数据 +- 支持自定义数据目录 (`extra_data_dir`) + +### DiffusionCalibCompressor 关键实现 + +```python +class DiffusionCalibCompressor(CalibCompessor): + def __init__(self, config, model, guidance_scale=7.5, num_inference_steps=50, **kwargs): + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.pipe = None + super().__init__(config, model, **kwargs) + + def post_init(self): + # 预先加载 diffusion pipeline + if isinstance(self.model_context.model, str): + self._load_diffusion_model() + super().post_init() + + def _load_diffusion_model(self): + # 加载完整的 pipeline + pipe, pipe_config = diffusion_load_model( + pretrained_model_name_or_path=self.model_context.model, + platform=self.platform, + device=self.compress_context.device, + ) + self.pipe = pipe + + # 提取 transformer 或 unet 用于量化 + if hasattr(pipe, "transformer"): + self.model_context.model = pipe.transformer + elif hasattr(pipe, "unet"): + self.model_context.model = pipe.unet + + @torch.no_grad() + def calib(self, nsamples, bs): + # 获取 diffusion dataloader + self.dataloader = get_diffusion_dataloader( + pipe=self.pipe, + dataset=self.dataset, + nsamples=nsamples, + batch_size=bs, + seed=self.seed, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + ) + + # 执行校准 + for data in self.dataloader: + self.model_context.model(**data) +``` + +**关键点:** +- 需要加载完整的 diffusion pipeline +- 从 pipeline 中提取 transformer/unet 组件 +- 使用扩散模型特定的数据生成逻辑 + +### 完整数据流 + +``` +1. 用户调用 Compressor(config, model, ...) + │ + ├─> Compressor.__new__() + │ ├─> detect_model_type(model) + │ │ └─> 返回 "llm" | "mllm" | "diffusion" + │ │ + │ └─> 根据 config 类型和 model_type 创建实例 + │ ├─> MLLMCalibCompressor (MLLM + AutoRound) + │ ├─> DiffusionCalibCompressor (Diffusion + AutoRound) + │ ├─> CalibCompessor (LLM + AutoRound) + │ ├─> ImatrixCompressor (RTN + imatrix) + │ └─> ZeroShotCompressor (RTN) + │ +2. 实例.__init__() + │ ├─> 保存模型特定参数 + │ └─> super().__init__() 调用父类 + │ +3. 用户调用 compressor.quantize() + │ + ├─> post_init() + │ ├─> _load_model() (可能被子类重写) + │ └─> 初始化 quantizer + │ + ├─> calib(nsamples, bs) (可能被子类重写) + │ ├─> 准备 dataloader (模型特定) + │ └─> 执行校准前向传播 + │ + ├─> cache_inter_data() + │ └─> 缓存中间激活值 + │ + ├─> 对每个 block 执行量化 + │ └─> 运行量化算法 (AutoRound/RTN 等) + │ + └─> 返回 (quantized_model, layer_config) +``` + +## 与旧架构对比 + +### 旧架构 (`compressors/`) + +**使用方式:** +```python +# 需要手动选择 Compressor +from auto_round.compressors.mllm.compressor import MLLMCompressor +from auto_round.compressors.diffusion.compressor import DiffusionCompressor + +# MLLM +mllm_compressor = MLLMCompressor( + model=model, + scheme="W4A16", + iters=200, + # ... 很多参数 +) + +# Diffusion +diffusion_compressor = DiffusionCompressor( + model=model, + scheme="W4A16", + iters=200, + # ... 很多参数 +) +``` + +**问题:** +- 用户需要手动判断模型类型 +- 需要导入不同的 Compressor 类 +- 参数直接传给 Compressor,没有统一的配置对象 +- 每个 Compressor 都是独立实现,代码重复 + +### 新架构 (`compressors_new/`) + +**使用方式:** +```python +# 统一入口,自动检测 +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) + +# 同一个入口处理所有模型类型 +compressor = Compressor( + config=config, + model=model, # 自动检测是 LLM/MLLM/Diffusion + tokenizer=tokenizer, + # 模型特定参数... 
+) +``` + +**优势:** +- ✅ 自动模型类型检测 +- ✅ 统一的配置对象 (AlgConfig) +- ✅ 单一入口点 +- ✅ 通过继承复用代码 +- ✅ 易于扩展新模型类型 + +## 测试 + +### 运行测试脚本 + +```bash +# 运行完整测试 +python test_compressor_new_arch.py + +# 测试特定类型 +python -c "from test_compressor_new_arch import test_mllm_compressor; test_mllm_compressor()" +``` + +### 测试内容 + +1. **模型类型检测测试** + ```python + from auto_round.compressors_new.entry import detect_model_type + + assert detect_model_type("/models/opt-125m/") == "llm" + assert detect_model_type("/models/Qwen2-VL-2B-Instruct") == "mllm" + assert detect_model_type("/models/stable-diffusion-2-1") == "diffusion" + ``` + +2. **Compressor 创建测试** + ```python + from auto_round.compressors_new.entry import Compressor + from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + + config = AutoRoundConfig(scheme="W4A16") + + # 测试 LLM + comp = Compressor(config=config, model="/models/opt-125m/") + assert isinstance(comp, CalibCompessor) + + # 测试 MLLM + comp = Compressor(config=config, model="/models/Qwen2-VL-2B-Instruct") + assert isinstance(comp, MLLMCalibCompressor) + + # 测试 Diffusion + comp = Compressor(config=config, model="/models/stable-diffusion-2-1") + assert isinstance(comp, DiffusionCalibCompressor) + ``` + +## 常见问题 + +### Q1: 如何判断我的模型会使用哪个 Compressor? + +**A:** 运行以下代码查看: + +```python +from auto_round.compressors_new.entry import detect_model_type, Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +model_path = "/your/model/path" + +# 检测模型类型 +model_type = detect_model_type(model_path) +print(f"Model type: {model_type}") + +# 创建 compressor 并查看类型 +config = AutoRoundConfig(scheme="W4A16") +comp = Compressor(config=config, model=model_path) +print(f"Compressor type: {type(comp).__name__}") +``` + +### Q2: 如何传递模型特定的参数? + +**A:** 直接传递给 `Compressor()`,它会自动转发: + +```python +# MLLM 特定参数 +compressor = Compressor( + config=config, + model=mllm_model_path, + processor=processor, # MLLM 特定 + template="qwen2_vl", # MLLM 特定 + extra_data_dir="/data/imgs", # MLLM 特定 +) + +# Diffusion 特定参数 +compressor = Compressor( + config=config, + model=diffusion_model_path, + guidance_scale=7.5, # Diffusion 特定 + num_inference_steps=50, # Diffusion 特定 +) +``` + +### Q3: 新架构是否向后兼容? + +**A:** 是的,旧的 `compressors/` 仍然可用: + +```python +# 旧方式仍然工作 +from auto_round.compressors.mllm.compressor import MLLMCompressor + +comp = MLLMCompressor(model=..., scheme="W4A16", ...) + +# 新方式 (推荐) +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16") +comp = Compressor(config=config, model=...) +``` + +### Q4: RTN 和 AutoRound 的区别? 
+ +**A:** + +| 特性 | RTN | AutoRound | +|------|-----|-----------| +| 需要校准数据 | ❌ 否 | ✅ 是 | +| 量化质量 | 较低 | 较高 | +| 量化速度 | 快 | 慢 | +| Compressor | ZeroShotCompressor | CalibCompessor 系列 | + +```python +# RTN - 快速但质量较低 +from auto_round.algorithms.quantization.rtn.config import RTNConfig + +config = RTNConfig(scheme="W4A16") + +# AutoRound - 慢但质量较高 +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig(scheme="W4A16", iters=200) +``` + +## 总结 + +新架构的核心优势: + +| 特性 | 说明 | 好处 | +|------|------|------| +| 🎯 **统一入口** | 一个 `Compressor` 类处理所有模型 | 简化使用,降低学习成本 | +| 🔍 **自动检测** | 自动识别 LLM/MLLM/Diffusion | 无需手动判断模型类型 | +| 🧩 **配置对象** | 使用 `AlgConfig` 统一配置 | 参数管理更清晰 | +| 🏗️ **继承复用** | 通过继承共享基类功能 | 减少代码重复 | +| 🔌 **易于扩展** | 3步添加新模型类型 | 符合开闭原则 | +| 🔄 **向后兼容** | 旧 API 仍然可用 | 平滑迁移 | + +### 迁移建议 + +**从旧架构迁移到新架构:** + +```python +# 旧代码 +from auto_round.compressors.mllm.compressor import MLLMCompressor + +comp = MLLMCompressor( + model=model, + scheme="W4A16", + iters=200, + nsamples=128, + # ... 更多参数 +) + +# 新代码 +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig + +config = AutoRoundConfig( + scheme="W4A16", + iters=200, + nsamples=128, +) +comp = Compressor( + config=config, + model=model, + # 模型特定参数自动识别 +) +``` + +**迁移步骤:** +1. 导入 `Compressor` 和 `AutoRoundConfig` +2. 创建 `config` 对象,将量化相关参数放入 config +3. 将模型特定参数直接传递给 `Compressor()` +4. 移除手动的模型类型判断代码 + +这种设计使得代码更加模块化、可维护和可扩展,同时保持了简单易用的 API 接口。 diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py new file mode 100644 index 000000000..5835130d6 --- /dev/null +++ b/auto_round/compressors_new/entry.py @@ -0,0 +1,354 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.rtn.config import RTNConfig +from auto_round.auto_scheme.gen_auto_scheme import AutoScheme +from auto_round.compressors_new.calib import CalibCompessor +from auto_round.compressors_new.zero_shot import ZeroShotCompressor +from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme + + +def is_weight_scheme(scheme): + if isinstance(scheme, str): + return scheme.upper().startswith("W") + if isinstance(scheme, dict): + return all(isinstance(s, str) and s.upper().startswith("W") for s in scheme.values()) + if isinstance(scheme, AutoScheme): + opts = scheme.options + if isinstance(opts, (list, tuple)): + return all(isinstance(s, str) and s.upper().startswith("W") for s in opts) + if isinstance(opts, str): + return opts.upper().startswith("W") + return False + + +def detect_model_type(model): + """Detect the type of model (LLM, MLLM, or Diffusion). 
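+    Diffusion is checked first (the most specific case), then MLLM; anything else
+    falls back to "llm".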
+ + Args: + model: Model instance or model path string + + Returns: + str: "mllm", "diffusion", or "llm" + """ + from auto_round.utils import is_diffusion_model, is_mllm_model + + # Check if it's a diffusion model first (more specific) + if is_diffusion_model(model): + return "diffusion" + + # Check if it's an MLLM + if is_mllm_model(model): + return "mllm" + + # Default to standard LLM + return "llm" + + +class Compressor(object): + SKIP_ARGS = ("local_args", "kwargs", "cls", "config") + + def __new__( + cls, + config: Union[AlgConfig, list[AlgConfig]], + model: Union[torch.nn.Module, str], + tokenizer=None, + platform="hf", + format=None, + **kwargs, + ): + # using different compressor base on AlgConfigs + local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} + + # Detect model type to determine if we need special compressor + model_type = detect_model_type(model) + + if isinstance(config, AutoRoundConfig): + # For AutoRound, we need calibration-based compression + # Dynamically create combined class using Mixin pattern + if model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMMixin + + # Create dynamic class: MLLMMixin + CalibCompessor + class MLLMCalibCompressor(MLLMMixin, CalibCompessor): + """MLLM model with AutoRound calibration compression""" + + pass + + return MLLMCalibCompressor(config, **local_args, **kwargs) + elif model_type == "diffusion": + from auto_round.compressors_new.diffusion_mixin import DiffusionMixin + + # Create dynamic class: DiffusionMixin + CalibCompessor + class DiffusionCalibCompressor(DiffusionMixin, CalibCompessor): + """Diffusion model with AutoRound calibration compression""" + + pass + + return DiffusionCalibCompressor(config, **local_args, **kwargs) + else: + return CalibCompessor(config, **local_args, **kwargs) + + elif isinstance(config, RTNConfig): + enable_imatrix = False + disable_opt_rtn = getattr(config, "disable_opt_rtn", False) + if not disable_opt_rtn: + has_gguf_k = "gguf" in format.lower() and "_k" in format.lower() if format else False + if has_gguf_k: + enable_imatrix = True + else: + sym = getattr(config, "sym", True) + if sym is not None and sym is False: + enable_imatrix = False + elif getattr(config, "data_type", "") == "int": + enable_imatrix = True + elif is_weight_scheme(config.scheme): + enable_imatrix = True + if enable_imatrix: + from auto_round.compressors_new.calib import ImatrixCompressor + + # For RTN with imatrix, dynamically combine with model-specific Mixin + if model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMMixin + + # Create dynamic class: MLLMMixin + ImatrixCompressor + class MLLMImatrixCompressor(MLLMMixin, ImatrixCompressor): + """MLLM model with RTN importance matrix compression""" + + pass + + return MLLMImatrixCompressor(config, **local_args, **kwargs) + elif model_type == "diffusion": + from auto_round.compressors_new.diffusion_mixin import DiffusionMixin + + # Create dynamic class: DiffusionMixin + ImatrixCompressor + class DiffusionImatrixCompressor(DiffusionMixin, ImatrixCompressor): + """Diffusion model with RTN importance matrix compression""" + + pass + + return DiffusionImatrixCompressor(config, **local_args, **kwargs) + else: + return ImatrixCompressor(config, **local_args, **kwargs) + else: + # For basic RTN, dynamically combine with model-specific Mixin + if model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMMixin + + # Create dynamic class: MLLMMixin + ZeroShotCompressor + class 
MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): + """MLLM model with zero-shot RTN compression""" + + pass + + return MLLMZeroShotCompressor(config, **local_args, **kwargs) + elif model_type == "diffusion": + from auto_round.compressors_new.diffusion_mixin import DiffusionMixin + + # Create dynamic class: DiffusionMixin + ZeroShotCompressor + class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor): + """Diffusion model with zero-shot RTN compression""" + + pass + + return DiffusionZeroShotCompressor(config, **local_args, **kwargs) + else: + return ZeroShotCompressor(config, **local_args, **kwargs) + + +class AutoRound: + """AutoRound wrapper class for backward compatibility. + + This class provides the same API as the old AutoRound class but internally + uses the new Compressor architecture with Mixin pattern. + + Args: + model: Model object or model name to load + tokenizer: Tokenizer for text processing + platform: Platform to download model ("hf" or "model_scope") + scheme: Quantization scheme (str, dict, or QuantizationScheme) + layer_config: Layer-wise quantization config + dataset: Calibration data + iters: Optimization iterations + seqlen: Calibration sequence length + nsamples: Number of calibration samples + batch_size: Calibration batch size + gradient_accumulate_steps: Gradient accumulation steps + low_gpu_mem_usage: Lower GPU memory mode + device_map: Device map for each module + enable_torch_compile: Enable torch.compile + seed: Random seed + low_cpu_mem_usage: Lower CPU memory mode + **kwargs: Additional arguments (bits, group_size, sym, etc.) + + Example: + >>> # Old API - still works + >>> from auto_round.compressors_new.entry import AutoRound + >>> autoround = AutoRound( + ... model="/models/opt-125m", + ... bits=4, + ... group_size=128, + ... iters=200, + ... ) + >>> quantized_model, layer_config = autoround.quantize() + """ + + SKIP_ARGS = ("local_args", "kwargs", "cls", "config") + + bits: int | None + group_size: int | None + sym: bool | None + data_type: str | None + act_bits: int | None + act_group_size: int | None + act_sym: bool | None + act_data_type: str | None + act_dynamic: bool | None + super_bits: int | None + super_group_size: int | None + + def __new__( + cls, + model: Union[torch.nn.Module, str], + tokenizer=None, + platform: str = "hf", + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + low_cpu_mem_usage: bool = True, + **kwargs, + ): + """Create AutoRound instance using new Compressor architecture. + + This method translates old AutoRound API to new Compressor API. 
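+        When iters == 0 the arguments are mapped to an RTNConfig; otherwise an
+        AutoRoundConfig is built from the tuning arguments. The resulting config,
+        together with any model-specific kwargs (processor, guidance_scale, ...),
+        is forwarded to Compressor, which selects the concrete compressor class.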
+ """ + from auto_round.utils import is_diffusion_model, is_mllm_model + + # Extract quantization parameters from kwargs or use defaults + bits = kwargs.pop("bits", None) + group_size = kwargs.pop("group_size", None) + sym = kwargs.pop("sym", None) + data_type = kwargs.pop("data_type", None) + act_bits = kwargs.pop("act_bits", None) + act_group_size = kwargs.pop("act_group_size", None) + act_sym = kwargs.pop("act_sym", None) + act_data_type = kwargs.pop("act_data_type", None) + act_dynamic = kwargs.pop("act_dynamic", None) + + # Decide which algorithm to use + if iters == 0: + # RTN mode + disable_opt_rtn = kwargs.pop("disable_opt_rtn", None) + config = RTNConfig( + scheme=scheme, + layer_config=layer_config, + bits=bits, + group_size=group_size, + sym=sym, + data_type=data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, + disable_opt_rtn=disable_opt_rtn, + ) + else: + # AutoRound mode + lr = kwargs.pop("lr", None) + minmax_lr = kwargs.pop("minmax_lr", None) + enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True) + enable_norm_bias_tuning = kwargs.pop("enable_norm_bias_tuning", False) + enable_quanted_input = kwargs.pop("enable_quanted_input", True) + + config = AutoRoundConfig( + scheme=scheme, + layer_config=layer_config, + iters=iters, + nsamples=nsamples, + seqlen=seqlen, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + bits=bits, + group_size=group_size, + sym=sym, + data_type=data_type, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, + lr=lr, + minmax_lr=minmax_lr, + enable_minmax_tuning=enable_minmax_tuning, + enable_norm_bias_tuning=enable_norm_bias_tuning, + enable_quanted_input=enable_quanted_input, + ) + + # Determine output format if specified + format = kwargs.pop("format", None) + + # Extract MLLM-specific parameters + processor = kwargs.pop("processor", None) + image_processor = kwargs.pop("image_processor", None) + template = kwargs.pop("template", None) + extra_data_dir = kwargs.pop("extra_data_dir", None) + quant_nontext_module = kwargs.pop("quant_nontext_module", False) + + # Extract Diffusion-specific parameters + guidance_scale = kwargs.pop("guidance_scale", 7.5) + num_inference_steps = kwargs.pop("num_inference_steps", 50) + generator_seed = kwargs.pop("generator_seed", None) + + # Check model type for logging + if is_mllm_model(model, platform=platform): + logger.info("Using MLLM mode for multimodal model (new architecture).") + elif is_diffusion_model(model): + logger.info("Using Diffusion mode for diffusion model (new architecture).") + else: + logger.info("Using LLM mode (new architecture).") + + # Create Compressor instance using new architecture + compressor = Compressor( + config=config, + model=model, + tokenizer=tokenizer, + platform=platform, + format=format, + dataset=dataset, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + low_cpu_mem_usage=low_cpu_mem_usage, + # MLLM parameters + processor=processor, + image_processor=image_processor, + template=template, + extra_data_dir=extra_data_dir, + quant_nontext_module=quant_nontext_module, + # Diffusion parameters + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + # Pass remaining kwargs + **kwargs, + ) + + return compressor diff --git 
a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py new file mode 100644 index 000000000..d14393bc2 --- /dev/null +++ b/auto_round/compressors_new/mllm_mixin.py @@ -0,0 +1,148 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import torch + +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.logger import logger + + +class MLLMMixin: + """MLLM-specific functionality mixin. + + This mixin adds MLLM-specific functionality to any compressor (CalibCompessor, + ZeroShotCompressor, ImatrixCompressor, etc). It handles multi-modal models + (vision-language models) that require special data loading and processing logic. + + Can be combined with: + - CalibCompessor (for AutoRound with calibration) + - ImatrixCompressor (for RTN with importance matrix) + - ZeroShotCompressor (for basic RTN) + + MLLM-specific parameters: + processor: Multi-modal processor for encoding/decoding data + image_processor: Image processor for models like LLaVA + template: Template for processing different MLLMs + extra_data_dir: Path to extra data (images, audio, videos) + quant_nontext_module: Whether to quantize non-text modules + """ + + def __init__( + self, + *args, + processor=None, + image_processor=None, + template=None, + extra_data_dir=None, + quant_nontext_module=False, + **kwargs, + ): + # Store MLLM-specific attributes before calling super().__init__ + self.processor = processor + self.image_processor = image_processor + self.template = template + self.extra_data_dir = extra_data_dir + self.quant_nontext_module = quant_nontext_module + self.template_obj = None + + # Call parent class __init__ (will be CalibCompessor, ImatrixCompressor, etc) + super().__init__(*args, **kwargs) + + @torch.no_grad() + def calib(self, nsamples, bs): + """Perform MLLM-specific calibration for quantization. + + Override parent's calib method to use MLLM dataset loading logic. 
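+        The steps are: resolve the chat template, build a dataloader via
+        get_mllm_dataloader, then run forward passes until nsamples inputs have
+        been consumed; exceptions raised during a forward pass are caught and
+        logged as warnings instead of aborting calibration.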
+ """ + from transformers import PreTrainedModel + + from auto_round.compressors.mllm.dataset import get_mllm_dataloader + from auto_round.compressors.mllm.template import get_template + from auto_round.special_model_handler import MISTRAL_3_2_MODELS + + # Handle template selection + if isinstance(self.model_context.model, PreTrainedModel): + model_type = getattr(self.model_context.model.config, "model_type", None) + if model_type == "llava" and self.template is None: + self.template = "default" + + if hasattr(self.model_context.model, "name_or_path"): + name = self.model_context.model.name_or_path + if any([m in name for m in MISTRAL_3_2_MODELS]): + self.template = "mistral3_2" + + # Get template + if self.template is not None: + self.template_obj = get_template(self.template) + elif hasattr(self.model_context.model.config, "model_type"): + self.template_obj = get_template(self.model_context.model.config.model_type) + else: + self.template_obj = get_template("default") + + logger.info(f"Using MLLM template: {self.template or 'default'}") + + # Get MLLM dataloader + self.dataloader = get_mllm_dataloader( + self.model_context.model, + self.tokenizer, + self.dataset, + self.processor, + self.image_processor, + nsamples, + self.quantize_config.seqlen, + self.seed, + bs, + self.template_obj, + self.extra_data_dir, + ) + + # Process data through the model for calibration + total_cnt = 0 + for data in self.dataloader: + if data is None: + continue + + # MLLM data is usually already properly formatted + if isinstance(data, dict): + # Move all tensors to device + data_new = {} + for key, value in data.items(): + if isinstance(value, torch.Tensor): + data_new[key] = value.to(self.model_context.model.device) + else: + data_new[key] = value + else: + data_new = data + + try: + if isinstance(data_new, dict): + self.model_context.model(**data_new) + else: + self.model_context.model(data_new) + except NotImplementedError: + pass + except Exception as e: + logger.warning(f"Calibration forward pass failed: {e}") + + total_cnt += bs + if total_cnt >= nsamples: + break + + if total_cnt == 0: + logger.error("no data has been cached, please provide more data") + exit(-1) + elif total_cnt < nsamples: + logger.warning(f"Insufficient number of samples: required {nsamples}, but only {total_cnt} were processed.") diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 8afe83f72..184368fe5 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import os import random import re import sys @@ -1032,3 +1033,48 @@ def _get_quantized_layer_names_outside_blocks(model, layer_config, supported_typ layer_names.append(key) return layer_names + + +def _get_save_folder_name(format, *args, **kwargs) -> str: + """Generates the save folder name based on the provided format string. + + If there are multiple formats to handle, the function creates a subfolder + named after the format string with special characters replaced. If there's + only one format, it returns the original output directory directly. + + Args: + format_str (str): The format identifier (e.g., 'gguf:q2_k_s'). + + Returns: + str: The path to the folder where results should be saved. 
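+
+    Example:
+        With output_dir "out" and more than one target format, a format whose
+        backend name is "gguf:q2_k_s" is written to "out/gguf-q2-k-s"; with a
+        single format, output_dir is returned unchanged.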
+ """ + from auto_round.context.compress import CompressContext + + compress_context = CompressContext.get_context() + # Replace special characters to make the folder name filesystem-safe + sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") + + # Use a subfolder only if there are multiple formats + if len(compress_context.formats) > 1: + return os.path.join(compress_context.output_dir, sanitized_format) + + return compress_context.output_dir + + +def immediate_pack(name: str, layer_config: dict): + from auto_round.context.compress import CompressContext + from auto_round.context.model import ModelContext + + compress_context = CompressContext.get_context() + model_context = ModelContext.get_context() + + if not compress_context.is_immediate_packing: + return + compress_context.formats[0].immediate_pack( + name=name, + model=model_context.model, + device=compress_context.device, + output_dir=_get_save_folder_name(compress_context.formats[0]), + layer_config=layer_config, + tokenizer=model_context.tokenizer, + ) diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py new file mode 100644 index 000000000..77dd6ec15 --- /dev/null +++ b/auto_round/compressors_new/zero_shot.py @@ -0,0 +1,317 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
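+
+"""Zero-shot compressor: RTN-style weight quantization that needs no calibration data."""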
+import copy +from typing import Any, Union + +import accelerate +import torch +from tqdm import tqdm + +from auto_round.algorithms.alg_config import AlgConfig +from auto_round.compressors_new.base import BaseCompressor +from auto_round.compressors_new.shard_writer import shard_writer +from auto_round.compressors_new.utils import ( + _get_quantized_layer_names_outside_blocks, + check_need_act_calibration, +) +from auto_round.logger import logger +from auto_round.modeling.fused_moe.replace_modules import materialize_model_ +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + clear_memory, + convert_module_to_hp_if_necessary, + flatten_list, + get_block_names, + get_lm_head_name, + get_module, + global_state, + memory_monitor, + set_module, + to_device, + to_dtype, +) + + +class ZeroShotCompressor(BaseCompressor): + need_calib: bool = False + + def __init__( + self, + config: Union[AlgConfig, list[AlgConfig]], + model: Union[torch.nn.Module, str], + tokenizer=None, + platform="hf", + format=None, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + enable_alg_ext: bool = False, + seed: int = 42, + low_cpu_mem_usage: bool = True, + **kwargs, + ): + super().__init__( + config=config, + model=model, + tokenizer=tokenizer, + platform=platform, + format=format, + device_map=device_map, + low_gpu_mem_usage=low_gpu_mem_usage, + enable_torch_compile=enable_torch_compile, + enable_alg_ext=enable_alg_ext, + seed=seed, + low_cpu_mem_usage=low_cpu_mem_usage, + **kwargs, + ) + self.lr = 5e-3 + + def _quantize_via_rtn_blockwise(self) -> None: + """Quantize model layers block by block using cached inputs and imatrix.""" + + all_blocks = self.quantizer.quant_block_list if self.quantizer.quant_block_list else get_block_names(self.model) + if not all_blocks: + raise ValueError("Could not find any blocks. Check the model or quant_block_list.") + + all_first_block_names = [block[0] for block in all_blocks] + layer_names = _get_quantized_layer_names_outside_blocks( + model=self.model_context.model, + layer_config=self.quantizer.layer_config, + supported_types=SUPPORTED_LAYER_TYPES, + quant_block_list=self.quantizer.quant_block_list, + ) + if self.quantize_config.is_act_quantize and (not self.quantize_config.act_dynamic or len(layer_names) > 0): + if len(layer_names) > 0: + logger.warning( + "quantize layers outside blocks for static activation quantizaiton" + " will significantly increase calibration time" + ) + all_inputs = self.try_cache_inter_data_gpucpu( + all_first_block_names, self.quantize_config.nsamples, layer_names + ) + else: + all_inputs = self.cache_inter_data(all_first_block_names, self.quantize_config.nsamples) + + # Clear hooks for multi-GPU setups + if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.model_context.model) + + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + + for block_names in all_blocks: + first_block = block_names[0] + inputs = all_inputs.pop(first_block) + input_keys = [k for k in inputs if k.startswith("hidden_state")] + if len(input_keys) != 1: + raise RuntimeError( + "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.inputs, device_list=self.compress_context.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.quantize_config.batch_size: + self.quantize_config.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compress_context.cache_device) + input_others = to_device(inputs, self.compress_context.cache_device) + + tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.model_context.model, block_name) + + self.quantizer.quantize_block( + block, + input_ids, + input_others, + ) + + if self.low_cpu_mem_usage and not self.is_immediate_saving: + self._offloader.offload(self.model_context.model, block_name) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compress_context.device_list) + else: + clear_memory(device_list=self.compress_context.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + # Collect names of quantizable layers not belonging to any block + remain_layer_names = [] + block_name_set = set(name for block in all_blocks for name in block) + for n, m in self.model_context.model.named_modules(): + if not check_to_quantized(m): + continue + # Skip if this layer is part of any block (by prefix match) + if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): + continue + remain_layer_names.append(n) + + for name in remain_layer_names: + dtype = None + if self.super_group_size is not None: + dtype = torch.float32 + self.quantizer.quantize_layer(name, dtype=dtype) + + @torch.inference_mode() + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. + Returns: + The quantized model and layer configurations. + """ + + self.post_init() + self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) + + if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + self._quantize_embedding_layer() # leave to gguf itself to handle + + # Release memory + clear_memory(device_list=self.device_list) + + if self.quantize_config.is_act_quantize and check_need_act_calibration( + self.quantize_config.act_dynamic, + self.quantize_config.act_data_type, + self.quantize_config.act_bits, + self.static_kv_dtype, + self.static_attention_dtype, + ): + hook_handles = self.quantizer._register_act_max_hook(self.model) + try: + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + logger.warning("Fallback to CPU. 
Consider using more GPUs via `--device 0,1,2,3`.") + self.model = self.model.to("cpu") + clear_memory(device_list=self.device_list) + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(self.model) + orig_device = self.device + self.device = "cpu" + self._quantize_via_rtn_blockwise() + self.device = orig_device + for handle in hook_handles: + handle.remove() + else: + # By default, we go with layer-wise way if no replacement happened. + # In RTN mode (iters == 0), force blockwise quantization to avoid + # full-model materialization and linear CPU RAM growth. + use_blockwise_quantization = global_state.replaced_module_count > 0 + if not use_blockwise_quantization: + logger.info( + "RTN mode detected (iters=0): force blockwise quantization to avoid " + "layer-wise full-model materialization." + ) + use_blockwise_quantization = True + tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) + if tied_weights_keys is None: + tied_weights_keys = [] + if isinstance(tied_weights_keys, dict): + tied_weights_values = list(tied_weights_keys.values()) + else: + tied_weights_values = list(tied_weights_keys) + tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias + # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it + if hasattr(self, "formats") and self.formats[0].is_gguf(): + lm_head_name = get_lm_head_name(self.model) + if lm_head_name is not None: + tied_weights_layers.append(lm_head_name) + + if use_blockwise_quantization: # The ram usage is a little higher + + all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + for block_names in all_blocks: + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.model, block_name) + self.quantizer.quantize_block(block, block_name=block_name) + + if self.low_cpu_mem_usage and not self.is_immediate_saving: + self._offloader.offload(self.model, block_name) + clear_memory(device_list=self.device_list) + memory_monitor.log_summary() + pbar.update(1) + cnt = 1 + remain_layer_names = [] + block_name_set = set(name for block in all_blocks for name in block) + for n, m in self.model_context.model.named_modules(): + if not check_to_quantized(m): + continue + # Skip if this layer is part of any block (by prefix match) + if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): + continue + remain_layer_names.append(n) + for name in remain_layer_names: + logger.info(f"Quantizing remaining layer {name} on CPU.") + self.quantizer.quantize_layer(name) + cnt += 1 + if cnt % 10 == 0: + clear_memory(device_list=self.device_list) + memory_monitor.log_summary() + else: + all_to_quantized_module_names: list[str] = [ + n for n, m in self.model.named_modules() if check_to_quantized(m) + ] + all_to_quantized_module_names = all_to_quantized_module_names + materialize_model_(self.model) + self.model.to("cpu") + block_names_cnt = len(flatten_list(get_block_names(self.model, True))) + clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt + cnt = 0 + pbar = tqdm(all_to_quantized_module_names) + + for n, m in self.model.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + pbar.set_description(f"Quantizing {m.global_name}") + 
self.quantizer.quantize_layer(m.global_name) + cnt += 1 + pbar.update() + if cnt % clear_mem_freq == 0: + clear_memory(device_list=self.device_list) + memory_monitor.log_summary() + + elif ( + not any(m.children()) + and len(m.state_dict()) > 0 + and n not in tied_weights_layers + and self.is_immediate_saving + ): + set_module(self.model, n, copy.deepcopy(m)) + shard_writer(self, name=n) + m.to("meta") + + # Convert remaining fp8 + convert_module_to_hp_if_necessary(self.model, self.amp_dtype, self.device) + if self.low_cpu_mem_usage: + self._offloader.reload(self.model) + if self.is_immediate_saving: + shard_writer(self, is_finalize=True) + + self.quantized = True + return self.model, self.layer_config diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index 1c2d23689..a67ac1966 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -34,11 +34,16 @@ def __init__( low_gpu_mem_usage: bool = False, device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, + is_immediate_packing: bool = False, + is_immediate_saving: bool = False, + formats: Union[list, str] = None, + output_dir: str = "./compressed_models", ): super().__init__() self.low_cpu_mem_usage = low_cpu_mem_usage self.low_gpu_mem_usage = low_gpu_mem_usage - + self.formats = formats + self.output_dir = output_dir if device_map is None: device_map = 0 self.device_map = device_map @@ -50,3 +55,6 @@ def __init__( self.cache_device = torch.device("cpu") if low_gpu_mem_usage else self.device self.enable_torch_compile = enable_torch_compile + self.immediate_packing = is_immediate_packing + self.is_immediate_saving = is_immediate_saving + self.formats = formats diff --git a/auto_round/context/model.py b/auto_round/context/model.py index a60385331..1117467a4 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -44,8 +44,6 @@ class ModelContext(BaseContext): _is_initialized = False quantized = False - act_quantize = False - # model_related _model_loaded = False _init_model = False @@ -160,7 +158,7 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def initialize(self, formats): + def initialize(self, formats, is_act_quantize=False): # load and handle model if not self._model_loaded: self._load_model() @@ -178,7 +176,7 @@ def initialize(self, formats): self.is_moe_model = is_moe_model(self.model) self._set_amp_dtype() - if self.act_quantize and self.amp_dtype == torch.float16: + if is_act_quantize and self.amp_dtype == torch.float16: logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") self.amp_dtype = torch.bfloat16 if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged From 9dc930c66992f7c43b478babc4409c06e311cd37 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:20:30 +0000 Subject: [PATCH 04/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors_new/__init__.py | 10 +++--- .../architecture_visualization.py | 32 +++++++++---------- auto_round/compressors_new/calib.py | 4 +-- auto_round/compressors_new/diffusion_mixin.py | 6 ++-- .../docs/compressors_new_architecture.md | 10 +++--- .../docs/compressors_new_architecture_CN.md | 22 ++++++------- auto_round/compressors_new/entry.py | 12 +++---- auto_round/compressors_new/mllm_mixin.py | 6 ++-- 8 
files changed, 51 insertions(+), 51 deletions(-) diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py index 91b9ffda8..902b0c526 100644 --- a/auto_round/compressors_new/__init__.py +++ b/auto_round/compressors_new/__init__.py @@ -15,7 +15,7 @@ # Lazy imports to avoid circular dependencies # Users should import from specific modules instead of this __init__.py -__all__ = ["Compressor", "CalibCompessor", "ImatrixCompressor", "ZeroShotCompressor", "AutoRound"] +__all__ = ["Compressor", "CalibCompressor", "ImatrixCompressor", "ZeroShotCompressor", "AutoRound"] def __getattr__(name): @@ -26,11 +26,11 @@ def __getattr__(name): if name == "Compressor": return Compressor return AutoRound - elif name == "CalibCompessor" or name == "ImatrixCompressor": - from auto_round.compressors_new.calib import CalibCompessor, ImatrixCompressor + elif name == "CalibCompressor" or name == "ImatrixCompressor": + from auto_round.compressors_new.calib import CalibCompressor, ImatrixCompressor - if name == "CalibCompessor": - return CalibCompessor + if name == "CalibCompressor": + return CalibCompressor return ImatrixCompressor elif name == "ZeroShotCompressor": from auto_round.compressors_new.zero_shot import ZeroShotCompressor diff --git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py index fb31f4827..654cdb0e0 100644 --- a/auto_round/compressors_new/architecture_visualization.py +++ b/auto_round/compressors_new/architecture_visualization.py @@ -20,7 +20,7 @@ def print_architecture_table(): print("-" * 100) # LLM combinations - print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'CalibCompessor':<35}") + print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'CalibCompressor':<35}") print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'ImatrixCompressor':<35}") print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'ZeroShotCompressor':<35}") @@ -28,7 +28,7 @@ def print_architecture_table(): # MLLM combinations print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'MLLMCalibCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompessor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompressor':<35}") print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'MLLMImatrixCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ImatrixCompressor':<35}") print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'MLLMZeroShotCompressor':<35}") @@ -38,7 +38,7 @@ def print_architecture_table(): # Diffusion combinations print(f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'DiffusionCalibCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompessor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompressor':<35}") print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'DiffusionImatrixCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ImatrixCompressor':<35}") print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'DiffusionZeroShotCompressor':<35}") @@ -58,16 +58,16 @@ def print_mixin_explanation(): print("-" * 100) print(" 1. MLLMMixin - MLLM features (processor, template, etc.)") print(" 2. DiffusionMixin - Diffusion features (guidance_scale, pipeline, etc.)") - print(" 3. CalibCompessor - Calibration-based compression algorithm (AutoRound)") + print(" 3. 
CalibCompressor - Calibration-based compression algorithm (AutoRound)") print(" 4. ImatrixCompressor - RTN + importance matrix") print(" 5. ZeroShotCompressor - Zero-shot RTN") print("\n🎯 Combination Approach:") print("-" * 100) print(" Dynamically create combined classes through multiple inheritance:") - print(" class MLLMCalibCompressor(MLLMMixin, CalibCompessor):") + print(" class MLLMCalibCompressor(MLLMMixin, CalibCompressor):") print(" pass") - print("\n MLLMMixin provides MLLM features, CalibCompessor provides compression algorithm") + print("\n MLLMMixin provides MLLM features, CalibCompressor provides compression algorithm") print("\n💡 Advantages:") print("-" * 100) @@ -100,7 +100,7 @@ def print_usage_examples(): processor=processor, template="qwen2_vl", ) -# Actually creates: MLLMCalibCompressor (MLLMMixin + CalibCompessor) +# Actually creates: MLLMCalibCompressor (MLLMMixin + CalibCompressor) """ ) @@ -131,7 +131,7 @@ def print_usage_examples(): model="/models/stable-diffusion-2-1", guidance_scale=7.5, ) -# Actually creates: DiffusionCalibCompressor (DiffusionMixin + CalibCompessor) +# Actually creates: DiffusionCalibCompressor (DiffusionMixin + CalibCompressor) """ ) @@ -145,13 +145,13 @@ def print_mro_example(): print("Method Resolution Order (MRO) Example") print("=" * 100 + "\n") - print("For MLLMCalibCompressor(MLLMMixin, CalibCompessor):") + print("For MLLMCalibCompressor(MLLMMixin, CalibCompressor):") print("-" * 100) print( """ MLLMCalibCompressor └─> MLLMMixin - └─> CalibCompessor + └─> CalibCompressor └─> BaseCompressor └─> object @@ -159,8 +159,8 @@ def print_mro_example(): 1. MLLMCalibCompressor.__init__() (if defined) 2. MLLMMixin.__init__() - Save MLLM-specific parameters (processor, template, etc.) - - Call super().__init__() → enters CalibCompessor - 3. CalibCompessor.__init__() + - Call super().__init__() → enters CalibCompressor + 3. CalibCompressor.__init__() - Save calibration-related parameters (dataset, iters, etc.) - Call super().__init__() → enters BaseCompressor 4. BaseCompressor.__init__() @@ -168,7 +168,7 @@ def print_mro_example(): Thus, MLLMCalibCompressor has both: ✓ MLLM features (from MLLMMixin) - ✓ Calibration compression functionality (from CalibCompessor) + ✓ Calibration compression functionality (from CalibCompressor) """ ) @@ -196,15 +196,15 @@ def print_decision_tree(): │ │ │ ├─ AutoRoundConfig (requires calibration) │ │ ├─ model_type == "mllm" -│ │ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompessor) +│ │ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompressor) │ │ │ return MLLMCalibCompressor(...) │ │ │ │ │ ├─ model_type == "diffusion" -│ │ │ └─> class DiffusionCalibCompressor(DiffusionMixin, CalibCompessor) +│ │ │ └─> class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) │ │ │ return DiffusionCalibCompressor(...) │ │ │ │ │ └─ model_type == "llm" -│ │ └─> return CalibCompessor(...) +│ │ └─> return CalibCompressor(...) 
│ │ │ └─ RTNConfig (zero-shot or imatrix) │ │ diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 2f0d8dbcc..738f9287c 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -61,7 +61,7 @@ from auto_round.wrapper import WrapperLinear, WrapperMultiblock -class CalibCompessor(BaseCompressor): +class CalibCompressor(BaseCompressor): need_calib: bool = True def __init__( @@ -998,7 +998,7 @@ def _check_compatibility(self) -> None: ) -class ImatrixCompressor(CalibCompessor): +class ImatrixCompressor(CalibCompressor): need_calib: bool = True def __init__( diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 5893abe73..cef1ba1d4 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -25,12 +25,12 @@ class DiffusionMixin: """Diffusion-specific functionality mixin. This mixin adds diffusion model-specific functionality to any compressor - (CalibCompessor, ZeroShotCompressor, ImatrixCompressor, etc). It handles + (CalibCompressor, ZeroShotCompressor, ImatrixCompressor, etc). It handles diffusion models (like Stable Diffusion, FLUX) that require special pipeline handling and data generation logic. Can be combined with: - - CalibCompessor (for AutoRound with calibration) + - CalibCompressor (for AutoRound with calibration) - ImatrixCompressor (for RTN with importance matrix) - ZeroShotCompressor (for basic RTN) @@ -48,7 +48,7 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.pipe = None # Will be set during model loading self.pipe_config = None - # Call parent class __init__ (will be CalibCompessor, ImatrixCompressor, etc) + # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) def post_init(self): diff --git a/auto_round/compressors_new/docs/compressors_new_architecture.md b/auto_round/compressors_new/docs/compressors_new_architecture.md index 1cd33111a..858a60cae 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture.md @@ -16,7 +16,7 @@ compressors_new/ ├── entry.py # 统一入口,自动检测模型类型 ├── base.py # BaseCompressor 基类 -├── calib.py # CalibCompessor (需要校准的算法) +├── calib.py # CalibCompressor (需要校准的算法) ├── zero_shot.py # ZeroShotCompressor (不需要校准的算法) ├── mllm_mixin.py # MLLMCalibCompressor (MLLM + 校准) └── diffusion_mixin.py # DiffusionCalibCompressor (Diffusion + 校准) @@ -26,7 +26,7 @@ compressors_new/ ``` BaseCompressor - ├── CalibCompessor (基于校准的压缩) + ├── CalibCompressor (基于校准的压缩) │ ├── MLLMCalibCompressor (MLLM 专用) │ └── DiffusionCalibCompressor (Diffusion 专用) │ @@ -138,7 +138,7 @@ Compressor.__new__() ├── AutoRoundConfig (需要校准) │ ├── MLLM → MLLMCalibCompressor │ ├── Diffusion → DiffusionCalibCompressor -│ └── LLM → CalibCompessor +│ └── LLM → CalibCompressor │ └── RTNConfig ├── enable_imatrix=True → ImatrixCompressor @@ -153,10 +153,10 @@ Compressor.__new__() ```python # compressors_new/new_model_calib.py -from auto_round.compressors_new.calib import CalibCompessor +from auto_round.compressors_new.calib import CalibCompressor -class NewModelCalibCompressor(CalibCompessor): +class NewModelCalibCompressor(CalibCompressor): def __init__(self, config, model, **kwargs): # 存储模型特定参数 self.special_param = kwargs.pop("special_param", None) diff --git a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md 
b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md index 4beca598c..9c6569fb7 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md @@ -16,7 +16,7 @@ compressors_new/ ├── entry.py # 统一入口,自动检测模型类型 ├── base.py # BaseCompressor 基类 -├── calib.py # CalibCompessor (基于校准的压缩) +├── calib.py # CalibCompressor (基于校准的压缩) ├── zero_shot.py # ZeroShotCompressor (零样本压缩) ├── mllm_mixin.py # MLLMCalibCompressor (多模态模型校准压缩) └── diffusion_mixin.py # DiffusionCalibCompressor (扩散模型校准压缩) @@ -27,7 +27,7 @@ compressors_new/ ``` BaseCompressor (基础压缩器) │ - ├── CalibCompessor (基于校准的压缩器) + ├── CalibCompressor (基于校准的压缩器) │ │ │ ├── MLLMCalibCompressor (多模态模型专用) │ │ └── 支持视觉-语言模型(如 Qwen2-VL, LLaVA 等) @@ -203,7 +203,7 @@ Compressor.__new__() │ │ └─ 提取 transformer/unet │ │ │ └─ model_type == "llm" -│ └─> CalibCompessor +│ └─> CalibCompressor │ └─ 标准文本数据集 │ └─ RTNConfig (零样本量化) @@ -235,7 +235,7 @@ class Compressor(object): return DiffusionCalibCompressor(config, model, tokenizer, platform, format, **kwargs) else: - return CalibCompessor(config, model, tokenizer, platform, format, **kwargs) + return CalibCompressor(config, model, tokenizer, platform, format, **kwargs) elif isinstance(config, RTNConfig): # RTN 可能需要 imatrix @@ -259,11 +259,11 @@ class Compressor(object): from typing import Union import torch from auto_round.algorithms.alg_config import AlgConfig -from auto_round.compressors_new.calib import CalibCompessor +from auto_round.compressors_new.calib import CalibCompressor from auto_round.logger import logger -class AudioCalibCompressor(CalibCompessor): +class AudioCalibCompressor(CalibCompressor): """音频模型专用校准压缩器""" def __init__( @@ -411,7 +411,7 @@ def is_audio_model(model_or_path: Union[str, torch.nn.Module]) -> bool: ### MLLMCalibCompressor 关键实现 ```python -class MLLMCalibCompressor(CalibCompessor): +class MLLMCalibCompressor(CalibCompressor): def __init__( self, config, model, processor=None, image_processor=None, template=None, extra_data_dir=None, **kwargs ): @@ -455,7 +455,7 @@ class MLLMCalibCompressor(CalibCompessor): ### DiffusionCalibCompressor 关键实现 ```python -class DiffusionCalibCompressor(CalibCompessor): +class DiffusionCalibCompressor(CalibCompressor): def __init__(self, config, model, guidance_scale=7.5, num_inference_steps=50, **kwargs): self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps @@ -518,7 +518,7 @@ class DiffusionCalibCompressor(CalibCompessor): │ └─> 根据 config 类型和 model_type 创建实例 │ ├─> MLLMCalibCompressor (MLLM + AutoRound) │ ├─> DiffusionCalibCompressor (Diffusion + AutoRound) - │ ├─> CalibCompessor (LLM + AutoRound) + │ ├─> CalibCompressor (LLM + AutoRound) │ ├─> ImatrixCompressor (RTN + imatrix) │ └─> ZeroShotCompressor (RTN) │ @@ -636,7 +636,7 @@ python -c "from test_compressor_new_arch import test_mllm_compressor; test_mllm_ # 测试 LLM comp = Compressor(config=config, model="/models/opt-125m/") - assert isinstance(comp, CalibCompessor) + assert isinstance(comp, CalibCompressor) # 测试 MLLM comp = Compressor(config=config, model="/models/Qwen2-VL-2B-Instruct") @@ -719,7 +719,7 @@ comp = Compressor(config=config, model=...) 
| 需要校准数据 | ❌ 否 | ✅ 是 | | 量化质量 | 较低 | 较高 | | 量化速度 | 快 | 慢 | -| Compressor | ZeroShotCompressor | CalibCompessor 系列 | +| Compressor | ZeroShotCompressor | CalibCompressor 系列 | ```python # RTN - 快速但质量较低 diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 5835130d6..e39b0b4c2 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -9,7 +9,7 @@ from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme -from auto_round.compressors_new.calib import CalibCompessor +from auto_round.compressors_new.calib import CalibCompressor from auto_round.compressors_new.zero_shot import ZeroShotCompressor from auto_round.logger import logger from auto_round.schemes import QuantizationScheme @@ -76,8 +76,8 @@ def __new__( if model_type == "mllm": from auto_round.compressors_new.mllm_mixin import MLLMMixin - # Create dynamic class: MLLMMixin + CalibCompessor - class MLLMCalibCompressor(MLLMMixin, CalibCompessor): + # Create dynamic class: MLLMMixin + CalibCompressor + class MLLMCalibCompressor(MLLMMixin, CalibCompressor): """MLLM model with AutoRound calibration compression""" pass @@ -86,15 +86,15 @@ class MLLMCalibCompressor(MLLMMixin, CalibCompessor): elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - # Create dynamic class: DiffusionMixin + CalibCompessor - class DiffusionCalibCompressor(DiffusionMixin, CalibCompessor): + # Create dynamic class: DiffusionMixin + CalibCompressor + class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): """Diffusion model with AutoRound calibration compression""" pass return DiffusionCalibCompressor(config, **local_args, **kwargs) else: - return CalibCompessor(config, **local_args, **kwargs) + return CalibCompressor(config, **local_args, **kwargs) elif isinstance(config, RTNConfig): enable_imatrix = False diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index d14393bc2..14157dcab 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -23,12 +23,12 @@ class MLLMMixin: """MLLM-specific functionality mixin. - This mixin adds MLLM-specific functionality to any compressor (CalibCompessor, + This mixin adds MLLM-specific functionality to any compressor (CalibCompressor, ZeroShotCompressor, ImatrixCompressor, etc). It handles multi-modal models (vision-language models) that require special data loading and processing logic. 
Can be combined with: - - CalibCompessor (for AutoRound with calibration) + - CalibCompressor (for AutoRound with calibration) - ImatrixCompressor (for RTN with importance matrix) - ZeroShotCompressor (for basic RTN) @@ -58,7 +58,7 @@ def __init__( self.quant_nontext_module = quant_nontext_module self.template_obj = None - # Call parent class __init__ (will be CalibCompessor, ImatrixCompressor, etc) + # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) @torch.no_grad() From 70a2d0268b8a5117a03bcb997186570395e13baf Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 17 Mar 2026 17:20:47 +0800 Subject: [PATCH 05/90] add switch Signed-off-by: n1ck-guo --- auto_round/autoround.py | 7 +++++++ auto_round/compressors/utils.py | 26 +------------------------- auto_round/schemes.py | 7 +------ auto_round/utils/common.py | 24 ++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 8b161be73..b787e1f02 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -32,6 +32,8 @@ if TYPE_CHECKING: from auto_round.auto_scheme.gen_auto_scheme import AutoScheme +NEW_ARCH = True + class AutoRound: """Automatic weight rounding (Signed Gradient Descent) for LLM quantization @@ -159,6 +161,11 @@ def __new__( local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} + if NEW_ARCH: + from auto_round.compressors_new import AutoRound as AutoRoundNew + + AutoRoundNew(**local_args) + model_cls = [] if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform): diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index fdba49a88..1a7bdc506 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -28,7 +28,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme -from auto_round.utils import check_to_quantized, to_standard_regex +from auto_round.utils import check_to_quantized, infer_bits_by_data_type, to_standard_regex class BackendDataType(str, Enum): @@ -199,30 +199,6 @@ def collect_best_params(block, cache_device="cpu"): return params -def infer_bits_by_data_type(data_type: str): - """Infer bits by data_type - - Args: - data_type (str): data_type - - Returns: - int: bits inferred by data_type, None means cannot infer correct bits by data_type - """ - from auto_round.utils import SUPPORTED_DTYPES - - if data_type is None: - return 16 - for supported_dtype in SUPPORTED_DTYPES: - if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): - ##first check the following two bits - suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2] - if str.isdigit(suc_2str): - return int(suc_2str) - if str.isdigit(data_type[len(supported_dtype)]): - return int(data_type[len(supported_dtype)]) - return None - - def _get_safetensor_layer_names_not_in_model(model, all_module_names: list) -> list: """Collect layer names from safetensor files that are not loaded into the model. 
diff --git a/auto_round/schemes.py b/auto_round/schemes.py index fc37961f7..953d10921 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -18,9 +18,8 @@ import torch -from auto_round.compressors.utils import infer_bits_by_data_type from auto_round.logger import logger -from auto_round.utils import SUPPORTED_DTYPES +from auto_round.utils import SUPPORTED_DTYPES, infer_bits_by_data_type __all__ = ["QuantizationScheme", "get_gguf_scheme", "preset_name_to_scheme"] @@ -284,7 +283,6 @@ def _parse_scheme( } ) - W2A16G32 = QuantizationScheme.from_dict( { "bits": 2, @@ -341,7 +339,6 @@ def _parse_scheme( } ) - MXFP8 = QuantizationScheme.from_dict( { "bits": 8, @@ -368,7 +365,6 @@ def _parse_scheme( } ) - NVFP4 = QuantizationScheme.from_dict( { "bits": 4, @@ -440,7 +436,6 @@ def _parse_scheme( } ) - # For AutoScheme 16 bits options BF16 = QuantizationScheme.from_dict( { diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index 96b6066e5..0da1a9845 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -590,3 +590,27 @@ def compress_layer_names(names: list) -> str: parts.append(f"{prefix}[{range_str}]{suffix}") parts.extend(singles) return ", ".join(parts) + + +def infer_bits_by_data_type(data_type: str): + """Infer bits by data_type + + Args: + data_type (str): data_type + + Returns: + int: bits inferred by data_type, None means cannot infer correct bits by data_type + """ + from auto_round.utils import SUPPORTED_DTYPES + + if data_type is None: + return 16 + for supported_dtype in SUPPORTED_DTYPES: + if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): + ##first check the following two bits + suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2] + if str.isdigit(suc_2str): + return int(suc_2str) + if str.isdigit(data_type[len(supported_dtype)]): + return int(data_type[len(supported_dtype)]) + return None From 5998d4441237d4f3c5c4f56f10a36b90a1711bb5 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 17 Mar 2026 17:43:02 +0800 Subject: [PATCH 06/90] code scan Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 4 +++- auto_round/compressors_new/calib.py | 10 ++++++---- auto_round/compressors_new/zero_shot.py | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index dae46c8ad..bce98a63d 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -124,7 +124,9 @@ def __init__( logger.warning("The static kv is experimental and currently has limited support.") if kwargs: - logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") + logger.warning( + f"unrecognized keys {list(kwargs.keys())} were passed. Please check them. If you use old api, just ignore this warning. 
" + ) if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Deprecated, default not to use torch.use_deterministic_algorithms diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 2f0d8dbcc..43522778c 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -143,7 +143,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l if str(self.model_context.model.device) == "cpu" and ( not self.compress_context.device.startswith("hpu") ): - # type(self.model_context.model._no_split_modules) changes from list to set when transformers > 5.0 + # type(self.model_context.model._no_split_modules) changes from list to set + # when transformers > 5.0 no_split_modules = list(getattr(self.model_context.model, "_no_split_modules", [])) devices = parse_available_devices(self.compress_context.device_map) @@ -438,8 +439,8 @@ def calib(self, nsamples, bs): break if total_cnt == 0: logger.error( - f"no data has been cached, please provide more data with sequence length >={self.quantize_config.seqlen} in the " - f"dataset or decease the sequence length" + f"no data has been cached, please provide more data with sequence length " + f">={self.quantize_config.seqlen} in the dataset or decease the sequence length" ) exit(-1) elif total_cnt < nsamples: @@ -1224,7 +1225,8 @@ def quantize(self): self.post_init() self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + formats = getattr(self, "formats", None) or [] + if not (any(fmt.is_gguf() for fmt in formats) or self.super_bits is not None): self._quantize_embedding_layer() # leave to gguf itself to handle # Release memory diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 77dd6ec15..f9528d9d4 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -211,10 +211,10 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: import accelerate accelerate.hooks.remove_hook_from_submodules(self.model) - orig_device = self.device - self.device = "cpu" + orig_device = self.compress_context.device + self.compress_context.device = "cpu" self._quantize_via_rtn_blockwise() - self.device = orig_device + self.compress_context.device = orig_device for handle in hook_handles: handle.remove() else: From 394dcdd56c80e1e5c3077834b77ae3f699486a58 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 17 Mar 2026 19:02:58 +0800 Subject: [PATCH 07/90] fix Signed-off-by: n1ck-guo --- .../quantization/auto_round/config.py | 2 + .../quantization/auto_round/quantizer.py | 155 ++++++++++++----- auto_round/algorithms/quantization/base.py | 25 ++- .../algorithms/quantization/rtn/quantizer.py | 2 +- auto_round/autoround.py | 4 +- auto_round/calibration/utils.py | 10 ++ auto_round/compressors_new/__init__.py | 7 + auto_round/compressors_new/base.py | 3 +- auto_round/compressors_new/calib.py | 5 + auto_round/compressors_new/diffusion_mixin.py | 162 +++++++++++++----- auto_round/compressors_new/utils.py | 43 +++++ auto_round/compressors_new/zero_shot.py | 13 +- auto_round/context/base.py | 2 + 13 files changed, 331 insertions(+), 102 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/config.py b/auto_round/algorithms/quantization/auto_round/config.py index cb8e1b23f..781ea38c9 100644 --- 
a/auto_round/algorithms/quantization/auto_round/config.py +++ b/auto_round/algorithms/quantization/auto_round/config.py @@ -47,6 +47,7 @@ def __init__( nsamples: int = 128, momentum: float = 0.0, batch_size: int = 8, + nblocks: int = 1, enable_minmax_tuning: bool = True, enable_norm_bias_tuning: bool = False, gradient_accumulate_steps: int = 1, @@ -79,6 +80,7 @@ def __init__( self.seqlen = seqlen self.nsamples = nsamples self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps + self.nblocks = nblocks self.momentum = momentum self.enable_alg_ext = enable_alg_ext diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index d64e5087d..5d682a9df 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -62,6 +62,11 @@ from auto_round.utils.distributed import setup_ddp_if_needed_ from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +DIFFUSION_OUTPUT_CONFIGS = { + "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], +} + class ARQuantizer(BaseQuantizers): @@ -215,51 +220,42 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): return forward - def normalize_decoding_layer_inputs_(self, decoding_layer_inputs: list[tuple[tuple[Any, dict[str, Any]]]]): - """ - Processes and stores decoding layer inputs for block quantization. - - This function iterates through a list of captured decoding layer calls, - replaying them through a fake decoding layer to extract and store the - inputs required for the decoding block in `self.inputs`. This effectively - "normalizes" the inputs by making them accessible in a consistent format - for subsequent quantization steps. - - Args: - decoding_layer_inputs: - A list of entries captured by a forward hook on the decoding layer. - Each element is expected to be a tuple whose first item is - `(args, kwargs)`, where `args` are the positional arguments and - `kwargs` are the keyword arguments seen during the original - forward pass. 
- - The capture hook look like: - - def input_capture_hook(module, *args, **kwargs): - _all_module_input[module._global_name].append((args, kwargs)) - """ - first_block_name = self.quant_block_list[0][0] - - class _FakeDecodingLayer(torch.nn.Module): - - def forward(self, *args, **kwargs): - return args, kwargs - - fake_layer = _FakeDecodingLayer() - fake_layer.orig_forward = fake_layer.forward - fake_layer.forward = partial(self._get_block_forward_func(first_block_name), fake_layer) - - self.inputs = {} - self.last_cache_name = None - for step_input in decoding_layer_inputs: - args, kwargs = step_input[0] - fake_layer(*args, **kwargs) - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: + if self.model_context.is_diffusion: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output current_output = [output[x] for x in indices] current_output = torch.cat(current_output, dim=self.batch_dim) return current_output + def _get_diffusion_current_q_output( + self, + block: torch.nn.Module, + input_ids: dict, + input_others: dict, + indices: list[int], + device: str, + cache_device: str = "cpu", + ): + output_config = DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.shared_cache_keys, + ) + if isinstance(current_input_ids, dict): + hidden_states = current_input_ids.pop("hidden_states") + current_input_others.update(current_input_ids) + current_input_ids = hidden_states + output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) + return output_q.to(cache_device) + def _get_current_q_output( self, block: torch.nn.Module, @@ -269,6 +265,8 @@ def _get_current_q_output( device: str, cache_device: str = "cpu", ) -> torch.Tensor: + if self.model_context.is_diffusion: + return self._get_diffusion_current_q_output(block, input_ids, input_others, indices, device, cache_device) current_input_ids, current_input_others = self._sampling_inputs( input_ids, input_others, @@ -287,6 +285,10 @@ def _get_current_num_elm( input_ids: list[torch.Tensor], indices: list[int], ) -> int: + if self.model_context.is_diffusion: + current_input_ids = [input_ids["hidden_states"][i] for i in indices] + return sum(id.numel() for id in current_input_ids) + current_input_ids = [input_ids[i] for i in indices] return sum(id.numel() for id in current_input_ids) @@ -339,12 +341,12 @@ def quantize_block( ): self._quantize_block(block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs) if hasattr(block, "config"): - del block.block + del block.config if self.compress_context.is_immediate_saving: for n, tmp_m in block.named_modules(): if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): continue - immediate_pack(tmp_m.global_name, self.quantizer.layer_config) + immediate_pack(tmp_m.global_name, self.layer_config) def _quantize_block( self, @@ -704,6 +706,7 @@ def quantize_layer( nsamples = len(inputs) last_best_iter = 0 best_loss = torch.finfo(torch.float).max + best_params = None scaler = self._get_scaler() # pylint: disable=assignment-from-none init_loss = None gradient_accumulate_steps = 
self.batch_size # Force to low gpu @@ -824,6 +827,15 @@ def _get_block_outputs( Returns: The output tensor of the block. """ + if self.model_context.is_diffusion: + return self._get_diffusion_block_outputs( + block, + input_ids, + input_others, + bs, + self.compress_context.device, + self.compress_context.cache_device, + ) if ( (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks @@ -872,6 +884,65 @@ def _get_block_outputs( return output + @torch.no_grad() + def _get_diffusion_block_outputs( + self, + block: torch.nn.Module, + input_ids: Union[torch.Tensor, dict], + input_others: torch.Tensor, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + batch_dim: The batch dimension of the output tensor. + + Returns: + The output tensor of the block. + """ + + output = defaultdict(list) + output_config = DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + if isinstance(input_ids, dict): + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, input_others, indices, self.seqlen, self.batch_dim, share_cache_keys=self.shared_cache_keys + ) + if isinstance(tmp_input_ids, dict): + hidden_states = tmp_input_ids.pop("hidden_states") + tmp_input_others.update(tmp_input_ids) + tmp_input_ids = hidden_states + + tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + assert len(output_config) == len(tmp_output) + tmp_output = dict(zip(output_config, tmp_output)) + + if save_output: + for name, out in tmp_output.items(): + if self.batch_size == 1: + output[name].append(out.to(cache_device)) + else: + output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) + if self.low_gpu_mem_usage: + clear_memory() + + return output + @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index affd244b1..8056a8ab1 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -59,6 +59,17 @@ class BaseQuantizers: + # Class-level attribute declarations for dynamic properties set in post_init() + # These prevent pylint E1101 (no-member) and E0203 (access-member-before-definition) errors + model_context = None + compress_context = None + dataset = None + quant_block_list = None + orig_scheme = None + is_auto_scheme = False + supported_types = SUPPORTED_LAYER_TYPES + inner_supported_types = INNER_SUPPORTED_LAYER_TYPES + def __init__(self, config: QuantizationConfig): self.config = config self.layer_config = config.layer_config @@ -144,7 +155,7 @@ def post_init(self): self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) def _gen_auto_scheme(self) -> dict[str, dict]: - if self.mllm: + if self.model_context.is_mllm: logger.info("AutoScheme is not yet supported 
for multimodal LLMs.") sys.exit(-1) @@ -193,7 +204,7 @@ def _gen_auto_scheme(self) -> dict[str, dict]: self.ignore_layers, self.quant_lm_head, enable_gguf_official_mixed=False, - is_mllm=self.mllm, + is_mllm=self.model_context.is_mllm, ) quant_layer_names = layer_config.keys() scheme_keys = {f.name for f in fields(QuantizationScheme)} @@ -206,7 +217,11 @@ def _gen_auto_scheme(self) -> dict[str, dict]: # mainly using quant_layers and fixed by users from auto_round.auto_scheme.gen_auto_scheme import GenScheme - if not self.enable_torch_compile and self.super_bits is None and not self.orig_scheme.low_gpu_mem_usage: + if ( + not self.compress_context.enable_torch_compile + and self.super_bits is None + and not self.orig_scheme.low_gpu_mem_usage + ): logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") self.scheme_generator = GenScheme( self.orig_scheme, @@ -215,8 +230,8 @@ def _gen_auto_scheme(self) -> dict[str, dict]: fixed_layer_scheme_new, self.dataset, device_map=self.compress_context.device_map, - tokenizer=self.tokenizer, - enable_torch_compile=self.enable_torch_compile, + tokenizer=self.model_context.tokenizer, + enable_torch_compile=self.compress_context.enable_torch_compile, ) layer_config = self.scheme_generator.get_layer_config() return layer_config diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 52a6d83a5..d2ed466a9 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -85,7 +85,7 @@ def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): materialize_model_(block) for name, m in block.named_modules(): if hasattr(m, "global_name") and check_to_quantized(m): - self.quantize_layer(m.global_name, to_cpu=self.low_gpu_mem_usage) + self.quantize_layer(m.global_name) elif ( not any(m.children()) and len(m.state_dict()) > 0 diff --git a/auto_round/autoround.py b/auto_round/autoround.py index b787e1f02..a1ccb65aa 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -162,9 +162,9 @@ def __new__( local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} if NEW_ARCH: - from auto_round.compressors_new import AutoRound as AutoRoundNew + from auto_round.compressors_new.entry import AutoRound as AutoRoundNew - AutoRoundNew(**local_args) + return AutoRoundNew(**local_args) model_cls = [] diff --git a/auto_round/calibration/utils.py b/auto_round/calibration/utils.py index 0dc997fae..523561aef 100644 --- a/auto_round/calibration/utils.py +++ b/auto_round/calibration/utils.py @@ -41,6 +41,16 @@ def _infer_last_cache_name(block_names, layer_names=None, requested_last_cache_n def _update_inputs(inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: + from auto_round.context.model import ModelContext + + model_context = ModelContext() + if model_context.is_diffusion: + # flux transformer model's blocks will update hidden_states and encoder_hidden_states + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + if q_inputs is not None: + q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} + return inputs, q_inputs + keys = inputs.keys() input_id_str = [key for key in keys if key.startswith("hidden_state")] if len(input_id_str) != 1: diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py index 902b0c526..de63f779e 100644 --- a/auto_round/compressors_new/__init__.py +++ 
b/auto_round/compressors_new/__init__.py @@ -15,6 +15,13 @@ # Lazy imports to avoid circular dependencies # Users should import from specific modules instead of this __init__.py +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from auto_round.compressors_new.calib import CalibCompressor, ImatrixCompressor + from auto_round.compressors_new.entry import AutoRound, Compressor + from auto_round.compressors_new.zero_shot import ZeroShotCompressor + __all__ = ["Compressor", "CalibCompressor", "ImatrixCompressor", "ZeroShotCompressor", "AutoRound"] diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index bce98a63d..b3fd9047e 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -125,7 +125,8 @@ def __init__( if kwargs: logger.warning( - f"unrecognized keys {list(kwargs.keys())} were passed. Please check them. If you use old api, just ignore this warning. " + f"unrecognized keys {list(kwargs.keys())} were passed. " + "Please check them. If you use old api, just ignore this warning." ) if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 444489388..ce7a9006a 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -641,6 +641,11 @@ def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): return input_ids, input_others def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: + if self.model_context.is_diffusion: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + input_ids = {k: inputs.pop(k, None) for k in input_id_str} + input_others = inputs + return input_ids, input_others input_ids = inputs[first_input_name] inputs.pop(first_input_name, None) input_others = inputs diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index cef1ba1d4..71a4e0fb3 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import os from typing import Union import torch +from tqdm import tqdm from auto_round.algorithms.alg_config import AlgConfig from auto_round.context.model import ModelContext @@ -104,54 +105,123 @@ def calib(self, nsamples, bs): if self.pipe is None: raise ValueError("Diffusion pipeline must be loaded before calibration") - logger.info(f"Preparing diffusion dataloader with {nsamples} samples") - - # Get diffusion dataloader - self.dataloader = get_diffusion_dataloader( - pipe=self.pipe, - dataset=self.dataset, - nsamples=nsamples, - batch_size=bs, - seed=self.seed, - guidance_scale=self.guidance_scale, - num_inference_steps=self.num_inference_steps, - generator_seed=self.generator_seed, + logger.warning( + "Diffusion model will catch nsamples * num_inference_steps inputs, " + "you can reduce nsamples or num_inference_steps if OOM or take too much time." 
) - - # Process data through the model for calibration + if isinstance(self.dataset, str): + dataset = self.dataset.replace(" ", "") + self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( + dataset=dataset, + bs=self.batch_size, + seed=self.seed, + nsamples=self.nsamples, + gradient_accumulate_steps=self.gradient_accumulate_steps, + ) + else: + self.dataloader = self.dataset total_cnt = 0 - for data in self.dataloader: - if data is None: - continue - - # Diffusion data is usually already properly formatted - if isinstance(data, dict): - # Move all tensors to device - data_new = {} - for key, value in data.items(): - if isinstance(value, torch.Tensor): - data_new[key] = value.to(self.model_context.model.device) - else: - data_new[key] = value - else: - data_new = data - - try: - if isinstance(data_new, dict): - self.model_context.model(**data_new) - else: - self.model_context.model(data_new) - except NotImplementedError: - pass - except Exception as e: - logger.warning(f"Calibration forward pass failed: {e}") - - total_cnt += bs - if total_cnt >= nsamples: - break + total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) + if self.pipe.dtype != self.model.dtype: + self.pipe.to(self.model.dtype) + + if ( + hasattr(self.model, "hf_device_map") + and len(self.model.hf_device_map) > 0 + and self.pipe.device != self.model.device + and torch.device(self.model.device).type in ["cuda", "xpu"] + ): + logger.error( + "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " + "Please use model path for quantization or " + "move the pipeline object to GPU/XPU before passing them into API." + ) + exit(-1) + + if self.pipe.device != self.model.device: + self.pipe.to(self.model.device) + with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: + for ids, prompts in self.dataloader: + if isinstance(prompts, tuple): + prompts = list(prompts) + try: + self.pipe( + prompt=prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=( + None + if self.generator_seed is None + else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + ), + ) + except NotImplementedError: + pass + except Exception as error: + raise error + step = len(prompts) + total_cnt += step + pbar.update(step) + if total_cnt >= nsamples: + break if total_cnt == 0: - logger.error("no data has been cached, please provide more data") + logger.error( + f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " + f"dataset or decease the sequence length" + ) exit(-1) elif total_cnt < nsamples: - logger.warning(f"Insufficient number of samples: required {nsamples}, but only {total_cnt} were processed.") + logger.warning( + f"Insufficient number of samples collected may affect the quantization. " + f"target samples count is {nsamples}, while valid samples count is {total_cnt}" + ) + if total_cnt < self.batch_size: + raise ValueError( + f"valid samples is less than batch_size({self.batch_size})," + " please adjust self.batch_size or seqlen." 
+ ) + max_len = (total_cnt // self.batch_size) * self.batch_size + for k, v in self.inputs.items(): + for key in v: + if isinstance(v[key], list) and len(v[key]) == total_cnt: + self.inputs[k][key] = v[key][:max_len] + + # torch.cuda.empty_cache() + + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): + """Save the quantized model to the specified output directory in the specified format. + + Args: + output_dir (str, optional): The directory to save the quantized model. Defaults to None. + format (str, optional): The format in which to save the model. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place. Defaults to True. + **kwargs: Additional keyword arguments specific to the export format. + + Returns: + object: The compressed model object. + """ + if output_dir is None: + return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs) + + compressed_model = None + for name in self.pipe.components.keys(): + val = getattr(self.pipe, name) + sub_module_path = ( + os.path.join(output_dir, name) if os.path.basename(os.path.normpath(output_dir)) != name else output_dir + ) + if ( + hasattr(val, "config") + and hasattr(val.config, "_name_or_path") + and val.config._name_or_path == self.model.config._name_or_path + ): + compressed_model = super().save_quantized( + output_dir=sub_module_path if not self.is_immediate_saving else output_dir, + format=format, + inplace=inplace, + **kwargs, + ) + elif val is not None and hasattr(val, "save_pretrained"): + val.save_pretrained(sub_module_path) + self.pipe.config.save_pretrained(output_dir) + return compressed_model diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 184368fe5..9169fb757 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -1035,6 +1035,45 @@ def _get_quantized_layer_names_outside_blocks(model, layer_config, supported_typ return layer_names +def _get_diffusion_save_folder_name(format) -> str: + """Generates the save folder name based on the provided format string. + + If there are multiple formats to handle, the function creates a subfolder + named after the format string with special characters replaced. If there's + only one format, it returns the original output directory directly. + + Args: + format_str (str): The format identifier (e.g., 'gguf:q2_k_s'). + + Returns: + str: The path to the folder where results should be saved. 
+ """ + from auto_round.context.compress import CompressContext + from auto_round.context.model import ModelContext + + compress_context = CompressContext.get_context() + model_context = ModelContext.get_context() + + # Replace special characters to make the folder name filesystem-safe + sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") + + formats = compress_context.formats + # Use a subfolder only if there are multiple formats + if len(formats) > 1: + return ( + os.path.join(compress_context.output_dir, sanitized_format, "transformer") + if compress_context.is_immediate_saving + else os.path.join(compress_context.output_dir, sanitized_format, "transformer") + ) + + # if use is_immediate_saving, we need to save model in self.output_dir/transformer folder + return ( + os.path.join(compress_context.output_dir, "transformer") + if compress_context.is_immediate_saving + else compress_context.output_dir + ) + + def _get_save_folder_name(format, *args, **kwargs) -> str: """Generates the save folder name based on the provided format string. @@ -1049,8 +1088,12 @@ def _get_save_folder_name(format, *args, **kwargs) -> str: str: The path to the folder where results should be saved. """ from auto_round.context.compress import CompressContext + from auto_round.context.model import ModelContext compress_context = CompressContext.get_context() + model_context = ModelContext.get_context() + if model_context.is_diffusion: + return _get_diffusion_save_folder_name(format) # Replace special characters to make the folder name filesystem-safe sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index f9528d9d4..d4629de83 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -187,7 +187,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.post_init() self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + formats = self.formats if isinstance(self.formats, list) else [] + if not (any(fmt.is_gguf() for fmt in formats) or self.super_bits is not None): self._quantize_embedding_layer() # leave to gguf itself to handle # Release memory @@ -200,17 +201,19 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.static_kv_dtype, self.static_attention_dtype, ): - hook_handles = self.quantizer._register_act_max_hook(self.model) + model = self.model_context.model + hook_handles = self.quantizer._register_act_max_hook(model) try: self._quantize_via_rtn_blockwise() except torch.OutOfMemoryError: logger.warning("Fallback to CPU. 
Consider using more GPUs via `--device 0,1,2,3`.") - self.model = self.model.to("cpu") + model = model.to("cpu") + self.model_context.model = model clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: import accelerate - accelerate.hooks.remove_hook_from_submodules(self.model) + accelerate.hooks.remove_hook_from_submodules(model) orig_device = self.compress_context.device self.compress_context.device = "cpu" self._quantize_via_rtn_blockwise() diff --git a/auto_round/context/base.py b/auto_round/context/base.py index 9f65357f0..7da8b4f36 100644 --- a/auto_round/context/base.py +++ b/auto_round/context/base.py @@ -42,6 +42,8 @@ def __call__(cls, *args, **kwargs): class BaseContext(metaclass=AutoSkipInitMeta): + _instances = {} + def __init__(self): logger.info(f"{self.__class__.__name__} context initialized.") From 36daba082292e7901cfc1c722e040b66a0c375d2 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 18 Mar 2026 15:32:57 +0800 Subject: [PATCH 08/90] fix Signed-off-by: n1ck-guo --- .../quantization/auto_round/adam.py | 2 +- .../quantization/auto_round/quantizer.py | 163 ++++-------------- auto_round/algorithms/quantization/base.py | 38 ++++ .../algorithms/quantization/rtn/quantizer.py | 11 +- auto_round/compressors_new/base.py | 60 +++---- auto_round/compressors_new/calib.py | 38 ++-- auto_round/compressors_new/entry.py | 4 + auto_round/compressors_new/shard_writer.py | 80 ++++++--- auto_round/compressors_new/zero_shot.py | 5 +- auto_round/context/base.py | 27 +++ 10 files changed, 210 insertions(+), 218 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/adam.py b/auto_round/algorithms/quantization/auto_round/adam.py index b05af35ea..3f0f87325 100644 --- a/auto_round/algorithms/quantization/auto_round/adam.py +++ b/auto_round/algorithms/quantization/auto_round/adam.py @@ -33,7 +33,7 @@ def _get_optimizer(self, optimizer): def _get_scaler(self): scaler = None - if self.amp and not check_is_cpu(self.device): + if self.model_context.amp and not check_is_cpu(self.compress_context.device): from torch.cuda.amp import GradScaler scaler = GradScaler(init_scale=1024, growth_interval=100000) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 5d682a9df..e62bffc59 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -27,16 +27,11 @@ IndexSampler, block_forward, check_need_act_calibration, - check_skippable_keywords, collect_best_params, - get_shared_keys, immediate_pack, - infer_bits_by_data_type, - init_cache, - reset_params, ) from auto_round.logger import logger -from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.modeling.fused_moe.replace_modules import materialize_model_ from auto_round.sign_sgd import SignSGD from auto_round.utils import ( check_to_quantized, @@ -54,13 +49,10 @@ ) from auto_round.utils.device import ( clear_memory_if_reached_threshold, - get_major_device, - parse_available_devices, set_auto_device_map_for_block_with_tuning, - set_non_auto_device_map, ) from auto_round.utils.distributed import setup_ddp_if_needed_ -from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +from auto_round.wrapper import WrapperLinear, unwrapper_block, 
unwrapper_layer, wrapper_block DIFFUSION_OUTPUT_CONFIGS = { "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], @@ -106,120 +98,6 @@ def post_init(self): except (ImportError, ModuleNotFoundError): logger.error("algorithm extension import error, fallback to default mode") - @torch.no_grad() - def _get_block_forward_func(self, name: str) -> Callable: - """Gets the forward function. - - Args: - name (str): The name of the function. - Returns: - function: The forward function. - """ - - def post_process_cache_data(batch_size, data, data_name): - """ - Processes store data for batch handling, reshaping if necessary. - - Args: - batch_size (int): The size of the batch. - data: The data value to store, potentially for caching. - data_name (str): Name of the data. - - Returns: - Processed data or None - """ - new_data = data - if batch_size <= 1: - return new_data - if data_name in self.model_context.shared_cache_keys: - return None - if "alibi" in data_name: - if isinstance(data, torch.Tensor): - alibi = data - alibi = alibi.reshape(batch_size, -1, alibi.shape[1], alibi.shape[2]) - new_data = alibi - return new_data - - def forward(m, hidden_states=None, *positional_inputs, **kwargs): - """Rewrite forward function, process and collect input data. - - Args: - hidden_states (torch.Tensor): The hidden states tensor. - *positional_inputs: Variable number of positional arguments. - **kwargs: Variable number of keyword arguments. - - Returns: - NotImplementedError: Getting the first layer inputs and then raise the error to save runtime. - """ - if name not in self.inputs: - self.inputs[name] = {} - init_cache(positional_inputs, self.inputs[name]) - - if self.batch_dim is None: - self.batch_dim = 0 - if hidden_states is not None and self.batch_size > 1: - if hidden_states.shape[0] > self.batch_size: - self.batch_dim = 1 - if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.batch_size: - logger.error( - "this model has not been supported, " - "please raise an issue in https://github.com/intel/auto-round/issues" - " or try to set the `batch_size` to 1 and " - "`gradient_accumulate_steps` to your current batch size." 
- ) - exit(-1) - - if hidden_states is not None: - kwargs["hidden_states"] = hidden_states - - for key in kwargs.keys(): - if ( - isinstance(kwargs[key], torch.Tensor) - or isinstance(kwargs[key], list) - or isinstance(kwargs[key], tuple) - ): - if key not in self.inputs[name].keys(): # initialization - data = to_device(kwargs[key], device=torch.device("cpu")) - if data is None or (self.batch_size > 1 and key in self.model_context.shared_cache_keys): - self.inputs[name][key] = data - continue - if self.batch_size <= 1: - self.inputs[name][key] = [data] - else: - data = post_process_cache_data(self.batch_size, data, key) - self.inputs[name][key] = list(torch.split(data, 1, dim=self.batch_dim)) - else: # append cache inputs - new_data = post_process_cache_data(self.batch_size, kwargs[key], key) - if new_data is None: # shareable args or NoneType - continue - new_data = to_device(new_data, device=torch.device("cpu")) - if self.batch_size <= 1: - self.inputs[name][key].append(new_data) - else: - self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.batch_dim))) - elif isinstance(kwargs[key], (str, bool, type(None))): - if key not in self.inputs[name].keys(): - self.inputs[name][key] = kwargs[key] - else: - # Parameters not to be cached - if check_skippable_keywords(key): - logger.warning_once( - f"Please note that '{key}' key" " is not currently used in quantization fine-tuning." - ) - reset_params(self.inputs[name]) - - if self._should_stop_cache_forward(name): - raise NotImplementedError - else: - if hidden_states is not None: - kwargs.pop("hidden_states") - return m.orig_forward(hidden_states, *positional_inputs, **kwargs) - else: - # Currently only for Llama-3.2-Vision-Instruct Series - return m.orig_forward(*positional_inputs, **kwargs) - - return forward - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: if self.model_context.is_diffusion: assert "hidden_states" in output @@ -247,13 +125,21 @@ def _get_diffusion_current_q_output( indices, seqlen=self.seqlen, batch_dim=self.batch_dim, - share_cache_keys=self.shared_cache_keys, + share_cache_keys=self.model_context.shared_cache_keys, ) if isinstance(current_input_ids, dict): hidden_states = current_input_ids.pop("hidden_states") current_input_others.update(current_input_ids) current_input_ids = hidden_states - output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) + output_q = block_forward( + block, + current_input_ids, + current_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + device, + idx, + ) return output_q.to(cache_device) def _get_current_q_output( @@ -339,7 +225,9 @@ def quantize_block( auto_offload=True, **kwargs, ): - self._quantize_block(block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs) + q_outputs, output = self._quantize_block( + block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs + ) if hasattr(block, "config"): del block.config if self.compress_context.is_immediate_saving: @@ -347,6 +235,7 @@ def quantize_block( if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): continue immediate_pack(tmp_m.global_name, self.layer_config) + return q_outputs, output def _quantize_block( self, @@ -750,8 +639,8 @@ def quantize_layer( current_output = layer(org_input) autocast_ctx = ( nullcontext() - if not self.amp - else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) + if not self.model_context.amp 
+ else autocast(device_type=str(device).split(":")[0], dtype=self.model_context.amp_dtype) ) if self.attention_mask: tmp_attention_mask = [self.attention_mask[i] for i in indices] @@ -783,10 +672,10 @@ def quantize_layer( if total_loss < best_loss: best_loss = total_loss if not self.not_use_best_mse: - best_params = collect_best_params(wrapper_linear, self.cache_device) + best_params = collect_best_params(wrapper_linear, self.compress_context.cache_device) last_best_iter = i if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(wrapper_linear, self.cache_device) + best_params = collect_best_params(wrapper_linear, self.compress_context.cache_device) if not self.not_use_best_mse: if 0 < self.dynamic_max_gap <= i - last_best_iter: @@ -928,7 +817,15 @@ def _get_diffusion_block_outputs( tmp_input_others.update(tmp_input_ids) tmp_input_ids = hidden_states - tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + tmp_output = block_forward( + block, + tmp_input_ids, + tmp_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + device, + None, + ) assert len(output_config) == len(tmp_output) tmp_output = dict(zip(output_config, tmp_output)) @@ -938,7 +835,7 @@ def _get_diffusion_block_outputs( output[name].append(out.to(cache_device)) else: output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) - if self.low_gpu_mem_usage: + if self.compress_context.low_gpu_mem_usage: clear_memory() return output diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 8056a8ab1..60da2125d 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -422,3 +422,41 @@ def _quantize_embedding_layer(self): clear_memory(device_list=self.compress_context.device_list) return is_quantized + + def quantize_block(self, bock, input_ids, input_others, q_input=False, **kwargs): + """Quantizes a given block of the model based on the specified configuration. + + This method applies quantization to the specified block using the appropriate quantization + function determined by the block's configuration. It handles memory management and supports + optional automatic offloading to manage GPU memory usage. + + Args: + block (torch.nn.Module): The block of the model to be quantized. + input_ids (torch.Tensor): The input IDs for the block. + input_others (dict): Additional inputs required for the block's forward pass. + q_input (bool, optional): Whether to quantize the input. Defaults to False. + auto_offload (bool, optional): Whether to automatically offload to manage GPU memory. Defaults to True. + + Returns: + tuple: A tuple containing the quantized outputs and any additional output information. + """ + raise NotImplementedError("quantize_block must be implemented in subclasses of BaseQuantizers") + + def quantize_layer(self, layer, input_ids, input_others, q_input=False, **kwargs): + """Quantizes a single layer of the model based on the specified configuration. + + This method applies quantization to the specified layer using the appropriate quantization + function determined by the layer's configuration. It handles memory management and supports + optional automatic offloading to manage GPU memory usage. + + Args: + layer (torch.nn.Module): The layer of the model to be quantized. + input_ids (torch.Tensor): The input IDs for the layer. 
+ input_others (dict): Additional inputs required for the layer's forward pass. + q_input (bool, optional): Whether to quantize the input. Defaults to False. + auto_offload (bool, optional): Whether to automatically offload to manage GPU memory. Defaults to True. + + Returns: + tuple: A tuple containing the quantized outputs and any additional output information. + """ + raise NotImplementedError("quantize_layer must be implemented in subclasses of BaseQuantizers") diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index d2ed466a9..b11959d42 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -22,7 +22,7 @@ from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer from auto_round.algorithms.quantization.base import BaseQuantizers from auto_round.algorithms.quantization.rtn.config import RTNConfig -from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors_new.shard_writer import ShardWriter from auto_round.compressors_new.utils import ( IndexSampler, block_forward, @@ -68,6 +68,8 @@ def __init__(self, config: RTNConfig): BaseQuantizers.__init__(self, config) def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): + shard_writer = ShardWriter.get_shard_writer() + tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) if tied_weights_keys is None: tied_weights_keys = [] @@ -94,7 +96,7 @@ def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): ): set_module(self.model, m.global_name, copy.deepcopy(m)) if self.compress_context.is_immediate_saving: - shard_writer(self, name=m.global_name) + shard_writer.write(name=m.global_name) copied_m = get_module(self.model, m.global_name) copied_m.to("meta") m.to("meta") @@ -105,7 +107,7 @@ def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): else: # Save once at block scope to capture tensors that are not saved # in per-layer branch (e.g., custom module-level params/buffers). - shard_writer(self, name=block_name) + shard_writer.write(name=block_name) block.to("meta") def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: @@ -183,6 +185,7 @@ def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: self._immediate_pack_and_save_module(name) def _immediate_pack_and_save_module(self, module_name): + shard_writer = ShardWriter.get_shard_writer() to_cpu = self.compress_context.low_gpu_mem_usage module = get_module(self.model, module_name) if self.compress_context.is_immediate_packing: # For gguf, packing conducts on block level @@ -198,7 +201,7 @@ def _immediate_pack_and_save_module(self, module_name): if self.compress_context.is_immediate_saving: module = get_module(self.model, module_name) module.to("cpu") - shard_writer(self, module, module_name, False) + shard_writer.write(module, module_name, False) # Free RAM immediately: the data is now in the shard-writer buffer # (and will be flushed to disk). Keeping it also in the model tree # causes linear RAM growth for large models. 
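# --- editorial sketch (not part of the patch) ----------------------------------------------
# The hunks above replace the module-level `shard_writer(rounder, ...)` helper with a
# `ShardWriter` singleton that callers fetch via `ShardWriter.get_shard_writer()`. A minimal
# usage sketch, assuming the interfaces introduced in this series (`ShardWriter(model, bits)`,
# `get_shard_writer()`, `write()`) and an already-initialized CompressContext/formats, as set
# up in `BaseCompressor.post_init()`; the helper name `stream_blocks_to_disk` is hypothetical:
from auto_round.compressors_new.shard_writer import ShardWriter

def stream_blocks_to_disk(model, block_names):
    ShardWriter(model, bits=8)                  # constructed once; later constructions reuse the instance
    writer = ShardWriter.get_shard_writer()     # fetched wherever saving happens, no rounder object passed
    for block_name in block_names:
        writer.write(name=block_name)           # buffer a finished block, flushing full shards to disk
    writer.write(is_finalize=True)              # write remaining tensors and the index file
# --------------------------------------------------------------------------------------------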
diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index b3fd9047e..25f082ba2 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -20,7 +20,8 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization import BaseQuantizers, QuantizationConfig -from auto_round.compressors_new.utils import block_forward +from auto_round.compressors_new.shard_writer import ShardWriter +from auto_round.compressors_new.utils import _get_save_folder_name, block_forward from auto_round.context.compress import CompressContext from auto_round.context.model import ModelContext from auto_round.formats import OutputFormat, get_formats @@ -31,6 +32,7 @@ compile_func, is_debug_mode, is_hpex_available, + memory_monitor, ) from auto_round.utils.device import set_non_auto_device_map from auto_round.utils.offload import OffloadManager @@ -73,6 +75,9 @@ class SerializedCompressorConfig: class BaseCompressor(object): need_calib: bool = True + compress_context: CompressContext = None + model_context: ModelContext = None + shard_writer: ShardWriter = None def __init__( self, @@ -174,6 +179,7 @@ def __init__( is_immediate_saving=self.is_immediate_saving, formats=self.formats, ) + ModelContext.reset_context() self.model_context = ModelContext( model, tokenizer=tokenizer, @@ -184,6 +190,7 @@ def __init__( need_calib=self.need_calib, device=self.compress_context.device, ) + self.shard_writer = None def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" @@ -212,6 +219,7 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") def post_init(self): + self.model_context._load_model() assert self.model_context._model_loaded, "should load model first" @@ -227,6 +235,7 @@ def post_init(self): if isinstance(self.formats, str): self.formats = get_formats(self.formats, self) self.compress_context.formats = self.formats + self.shard_writer = ShardWriter(self.model_context.model, bits=8) # Set device, must place after model loading set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) @@ -265,7 +274,7 @@ def __getattr__(self, name: str) -> Any: continue obj = object.__getattribute__(self, obj) try: - return object.__getattribute__(obj, name) + return getattr(obj, name) except AttributeError: continue @@ -351,28 +360,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """ raise NotImplementedError("quantize method must be implemented in subclass") - def _get_save_folder_name(self, format: OutputFormat) -> str: - """Generates the save folder name based on the provided format string. - - If there are multiple formats to handle, the function creates a subfolder - named after the format string with special characters replaced. If there's - only one format, it returns the original output directory directly. - - Args: - format_str (str): The format identifier (e.g., 'gguf:q2_k_s'). - - Returns: - str: The path to the folder where results should be saved. 
- """ - # Replace special characters to make the folder name filesystem-safe - sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") - - # Use a subfolder only if there are multiple formats - if len(self.formats) > 1: - return os.path.join(self.output_dir, sanitized_format) - - return self.output_dir - def save_quantized( self, output_dir: str = None, @@ -394,11 +381,11 @@ def save_quantized( """ self.output_dir = output_dir if format is not None: - logger.warning( - f"save_quantized with format is deprecated and will be deleted in auto_round version 1.0." - f" Please use Compressor(format='{format}' instead)." - ) if isinstance(format, str) and getattr(self, "formats", None) is None: + logger.warning( + f"save_quantized with format is deprecated and will be deleted in auto_round version 1.0." + f" Please use Compressor(format='{format}' instead)." + ) formats = get_formats(format, self) if not hasattr(self, "formats"): self.formats = formats @@ -408,7 +395,7 @@ def save_quantized( return folders = [] for format in self.formats: - save_folder = self._get_save_folder_name(format) + save_folder = _get_save_folder_name(format) if self.act_bits <= 8 and format.is_fake(): logger.warning( "Support for exporting activation quantization is limited. " @@ -429,7 +416,7 @@ def save_quantized( model=self.model_context.model, layer_config=self.quantizer.layer_config, inplace=inplace, - tokenizer=self.tokenizer, + tokenizer=self.model_context.tokenizer, device=self.compress_context.device, serialization_dict=serialization_dict, **kwargs, @@ -494,16 +481,19 @@ def quantize_and_save( with attention_quant_ctx(self.model_context.model, static_attention_dtype=self.static_attention_dtype): self.quantize() + self.model_context.quantized = True elif self.static_kv_dtype is not None: from auto_round.experimental.kv_cache import kvcache_quant_context with kvcache_quant_context(self.model_context.model, static_kv_dtype=self.static_kv_dtype): self.quantize() + self.model_context.quantized = True else: self.quantize() + self.model_context.quantized = True + + # Save the quantized model in the specified format_list + model, folders = self.save_quantized(output_dir, inplace=inplace, return_folders=True, **kwargs) + memory_monitor.log_summary() - # When immediate_saving is enabled, the model has already been saved during quantization - # Skip the save_quantized call to avoid attempting to save layers that are on meta device - if self.is_immediate_saving: - logger.info("immediate_saving is enabled, model already saved during quantization") - return self.model, [output_dir] + return model, folders diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index ce7a9006a..7da1b479f 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -29,7 +29,6 @@ _infer_last_cache_name, _update_inputs, ) -from auto_round.compressors.shard_writer import shard_writer from auto_round.compressors_new.base import BaseCompressor from auto_round.compressors_new.utils import ( _get_quantized_layer_names_outside_blocks, @@ -321,7 +320,7 @@ def calib(self, nsamples, bs): # slow here self.dataloader = get_dataloader( - self.tokenizer, + self.model_context.tokenizer, self.quantize_config.seqlen, dataset, self.seed, @@ -343,10 +342,10 @@ def calib(self, nsamples, bs): input_ids = data.to(self.model.device) data_new = input_ids elif isinstance(data, str): - if self.tokenizer is None: + if self.model_context.tokenizer is None: logger.error("please provide 
tokenizer for string input") exit(-1) - data = self.tokenizer( + data = self.model_context.tokenizer( data, truncation=True, max_length=self.quantize_config.seqlen, return_tensors="pt" ).data data_new = {} @@ -373,11 +372,11 @@ def calib(self, nsamples, bs): ): new_attention_mask = data_new["attention_mask"] elif ( - self.tokenizer is not None - and hasattr(self.tokenizer, "pad_token") - and self.tokenizer.pad_token is not None + self.model_context.tokenizer is not None + and hasattr(self.model_context.tokenizer, "pad_token") + and self.model_context.tokenizer.pad_token is not None ): - new_attention_mask = (input_ids != self.tokenizer.pad_token_id).to(torch.long) + new_attention_mask = (input_ids != self.model_context.tokenizer.pad_token_id).to(torch.long) else: # Default all ones new_attention_mask = torch.ones_like(input_ids, dtype=torch.long) @@ -406,6 +405,8 @@ def calib(self, nsamples, bs): # last position, so the impact on accuracy is minimal as basically equivalent to dropping a single token new_attention_mask[:, -1] = 0 + if not hasattr(self.quantizer, "attention_mask"): + self.quantizer.attention_mask = [] self.quantizer.attention_mask.extend(list(torch.split(new_attention_mask, 1, dim=0))) else: new_attention_mask = None @@ -425,7 +426,7 @@ def calib(self, nsamples, bs): except RuntimeError as error: error_msg = str(error) if "The expanded size of the tensor" in str(error_msg) and "must match the existing size" in error_msg: - check_seqlen_compatible(self.quantize_config.seqlen, self.tokenizer, self.model) + check_seqlen_compatible(self.quantize_config.seqlen, self.model_context.tokenizer, self.model) logger.warning( "When quantization encounters tensor shape mismatch error, " "you can try to avoid it with batch_size=1" @@ -709,7 +710,7 @@ def _quantize_blocks( ) if self.is_immediate_saving: - shard_writer(self, m, is_finalize=False) + self.shard_writer.write(m, is_finalize=False) if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: if nblocks == 1: @@ -738,7 +739,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. 
""" - self.post_init() self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) @@ -838,7 +838,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model_context.model, self.model_context.amp_dtype, self.compress_context.device, to_cpu=True ) if self.is_immediate_saving: - shard_writer(self, is_finalize=True) + self.shard_writer.write(is_finalize=True) if self.compress_context.low_cpu_mem_usage: self._offloader.reload(self.model_context.model) @@ -961,7 +961,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: if self.is_immediate_saving: m = get_module(self.model, layer_name) - shard_writer(self, m, name=layer_name, is_finalize=False) + self.shard_writer.write(m, name=layer_name, is_finalize=False) del layer_input clear_memory(q_layer_input, device_list=self.compress_context.device_list) memory_monitor.log_summary() @@ -982,14 +982,16 @@ def _check_compatibility(self) -> None: self.quantize_config.seqlen, self.model_context.model.config.max_position_embeddings ) - if self.quantize_config.seqlen is not None and hasattr(self.tokenizer, "model_max_length"): - if self.tokenizer.model_max_length < self.quantize_config.seqlen: + if self.quantize_config.seqlen is not None and hasattr(self.model_context.tokenizer, "model_max_length"): + if self.model_context.tokenizer.model_max_length < self.quantize_config.seqlen: logger.warning( - f"Change sequence length to {self.tokenizer.model_max_length} " + f"Change sequence length to {self.model_context.tokenizer.model_max_length} " "due to the limitation of model_max_length. " "You can also try to increase the model_max_length to avoid this issue." ) - self.quantize_config.seqlen = min(self.quantize_config.seqlen, self.tokenizer.model_max_length) + self.quantize_config.seqlen = min( + self.quantize_config.seqlen, self.model_context.tokenizer.model_max_length + ) if self.group_size == 0 and "fp8" not in self.data_type: logger.warning("`group_size==0` is not supported for data_type other than fp8 ") @@ -1091,7 +1093,7 @@ def _quantize_via_rtn_blockwise(self) -> None: input_others, ) - if self.low_cpu_mem_usage and not self.is_immediate_saving: + if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: self._offloader.offload(self.model_context.model, block_name) if block_name == block_names[-1]: clear_memory(input_ids, device_list=self.compress_context.device_list) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index e39b0b4c2..059875d75 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -268,6 +268,10 @@ def __new__( act_data_type=act_data_type, act_dynamic=act_dynamic, disable_opt_rtn=disable_opt_rtn, + # for optRTN + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, ) else: # AutoRound mode diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index b6dd329e3..d112c96ff 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -18,6 +18,9 @@ import torch +from auto_round.compressors_new.utils import _get_save_folder_name +from auto_round.context.compress import CompressContext +from auto_round.context.model import ModelContext from auto_round.logger import logger from auto_round.utils import get_lm_head_name, get_module @@ -27,8 +30,28 @@ class ShardWriter: Handles shard-saving of model parameters to disk with memory management. 
""" - def __init__(self, rounder): - self.model = rounder.model + _instance = None + _initialized = False + + model = None + lm_head_name = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._data = {} + return cls._instance + + def __init__( + self, + model, + bits, + max_shard_size=None, + safe_serialization=True, + ): + if ShardWriter._initialized: + return + self.model = model self.lm_head_name = get_lm_head_name(self.model) total_params = sum(p.numel() for p in self.model.parameters()) # Heuristic estimate of model size in GB used to choose a default max_shard_size: @@ -39,12 +62,13 @@ def __init__(self, rounder): # smaller than the full model; this intentionally # underestimates size before clamping below. max_split_num = 10 - model_size = int(total_params * rounder.bits // 1e9 // 8 + max_split_num - 1) / max_split_num + model_size = int(total_params * bits // 1e9 // 8 + max_split_num - 1) / max_split_num model_size = max(1, min(int(model_size), 5)) # Configuration - self.max_shard_size = self._parse_size(getattr(rounder, "max_shard_size", f"{model_size}GB")) - self.safe_serialization = getattr(rounder, "safe_serialization", True) + max_shard_size = max_shard_size or f"{model_size}GB" + self.max_shard_size = self._parse_size(max_shard_size) + self.safe_serialization = safe_serialization # Internal State self.use_safetensors = self._check_safetensors() @@ -65,9 +89,21 @@ def __init__(self, rounder): self.skipped_meta_tensors = [] # Directory Setup - self.output_dir = os.path.join(rounder._get_save_folder_name(rounder.formats[0]), "") + compress_context = CompressContext.get_context() + formats = compress_context.formats + self.output_dir = os.path.join(_get_save_folder_name(formats[0]), "") os.makedirs(self.output_dir, exist_ok=True) + ShardWriter._initialized = True + + @classmethod + def get_shard_writer(cls, *args, **kwargs): + if cls._instance is None: + raise ValueError( + "ShardWriter has not been initialized yet. Please create an instance before calling get_shard_writer." 
+ ) + return cls._instance + def _parse_size(self, size_str: str) -> int: if isinstance(size_str, int): return size_str @@ -228,21 +264,17 @@ def finalize(self): logger.info(f"model has been saved to {self.output_dir}") - -@torch.no_grad() -def shard_writer(rounder: object, m: torch.nn.Module = None, name: str = None, is_finalize: bool = False): - if m is None and name is None and not is_finalize and not is_finalize: - raise ValueError("Must specify either name or m") - if not hasattr(rounder, "_shard_writer"): - rounder._shard_writer = ShardWriter(rounder) - - if m is None and name is not None: - m = get_module(rounder.model, name) - # Perform the save - if m is not None: - rounder._shard_writer.save_module(m, name) - - if is_finalize: - rounder._shard_writer.finalize() - # Optional: cleanup the saver object from rounder - del rounder._shard_writer + @torch.no_grad() + def write(self, m: torch.nn.Module = None, name: str = None, is_finalize: bool = False): + if m is None and name is None and not is_finalize and not is_finalize: + raise ValueError("Must specify either name or m") + if m is None and name is not None: + m = get_module(self.model, name) + # Perform the save + if m is not None: + self.save_module(m, name) + + if is_finalize: + self.finalize() + # Optional: cleanup the saver object from rounder + self._initialized = False diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index d4629de83..8132d621e 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -20,7 +20,6 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.compressors_new.base import BaseCompressor -from auto_round.compressors_new.shard_writer import shard_writer from auto_round.compressors_new.utils import ( _get_quantized_layer_names_outside_blocks, check_need_act_calibration, @@ -306,7 +305,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: and self.is_immediate_saving ): set_module(self.model, n, copy.deepcopy(m)) - shard_writer(self, name=n) + self.shard_writer.write(name=n) m.to("meta") # Convert remaining fp8 @@ -314,7 +313,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if self.low_cpu_mem_usage: self._offloader.reload(self.model) if self.is_immediate_saving: - shard_writer(self, is_finalize=True) + self.shard_writer.write(is_finalize=True) self.quantized = True return self.model, self.layer_config diff --git a/auto_round/context/base.py b/auto_round/context/base.py index 7da8b4f36..2032accfc 100644 --- a/auto_round/context/base.py +++ b/auto_round/context/base.py @@ -43,10 +43,33 @@ def __call__(cls, *args, **kwargs): class BaseContext(metaclass=AutoSkipInitMeta): _instances = {} + _internal_attrs = {"_context_state", "_singleton_skip_init"} def __init__(self): + if "_context_state" not in self.__dict__: + object.__setattr__(self, "_context_state", {}) logger.info(f"{self.__class__.__name__} context initialized.") + def __getattr__(self, name): + context_state = object.__getattribute__(self, "_context_state") + if name in context_state: + return context_state[name] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + def __setattr__(self, name, value): + if name.startswith("_") or name in self._internal_attrs: + object.__setattr__(self, name, value) + return + self._set_context_attr(name, value) + + def _set_context_attr(self, name, value): + context_state = object.__getattribute__(self, "_context_state") + context_state[name] = 
value + + def _update_context_attrs(self, **kwargs): + for name, value in kwargs.items(): + self._set_context_attr(name, value) + @classmethod def get_context(cls): assert cls in cls._instances, f"{cls.__name__} context has not been created yet." @@ -55,3 +78,7 @@ def get_context(cls): @classmethod def create_context(cls, *args, **kwargs): return cls(*args, **kwargs) + + @classmethod + def reset_context(cls): + cls._instances.pop(cls, None) From 6feed9935de8945d9137ebe242b9d0b0f7b1b013 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 18 Mar 2026 15:38:51 +0800 Subject: [PATCH 09/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 2 +- auto_round/context/base.py | 23 ----------------------- auto_round/context/model.py | 12 ++++++------ 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 25f082ba2..af05d9c41 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -274,7 +274,7 @@ def __getattr__(self, name: str) -> Any: continue obj = object.__getattribute__(self, obj) try: - return getattr(obj, name) + return object.__getattribute__(obj, name) except AttributeError: continue diff --git a/auto_round/context/base.py b/auto_round/context/base.py index 2032accfc..e3f75fb8f 100644 --- a/auto_round/context/base.py +++ b/auto_round/context/base.py @@ -43,33 +43,10 @@ def __call__(cls, *args, **kwargs): class BaseContext(metaclass=AutoSkipInitMeta): _instances = {} - _internal_attrs = {"_context_state", "_singleton_skip_init"} def __init__(self): - if "_context_state" not in self.__dict__: - object.__setattr__(self, "_context_state", {}) logger.info(f"{self.__class__.__name__} context initialized.") - def __getattr__(self, name): - context_state = object.__getattribute__(self, "_context_state") - if name in context_state: - return context_state[name] - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - - def __setattr__(self, name, value): - if name.startswith("_") or name in self._internal_attrs: - object.__setattr__(self, name, value) - return - self._set_context_attr(name, value) - - def _set_context_attr(self, name, value): - context_state = object.__getattribute__(self, "_context_state") - context_state[name] = value - - def _update_context_attrs(self, **kwargs): - for name, value in kwargs.items(): - self._set_context_attr(name, value) - @classmethod def get_context(cls): assert cls in cls._instances, f"{cls.__name__} context has not been created yet." 
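Note: CompressContext and ModelContext (changed below) both inherit the singleton lifecycle that BaseContext keeps after this fix: the first construction registers the instance, get_context() retrieves it, and reset_context() drops it. The sketch below is illustrative only and not part of the patch series; DemoContext and its device argument are hypothetical, and the re-construction behaviour is what the AutoSkipInitMeta metaclass is assumed to provide (register on first construction, return the registered instance and skip __init__ afterwards).

    from auto_round.context.base import BaseContext

    class DemoContext(BaseContext):
        def __init__(self, device="cpu"):
            super().__init__()
            self.device = device

    # First construction creates and registers the singleton for this class.
    ctx = DemoContext.create_context(device="cpu")

    # Later components fetch the same instance by class;
    # get_context() asserts that the context was created first.
    assert DemoContext.get_context() is ctx

    # Re-construction is expected to return the registered instance and skip
    # __init__, so existing context state is not overwritten.
    ctx_again = DemoContext(device="cuda")

    # Dropping the instance lets the next run start from a clean context,
    # which is how BaseCompressor.__init__ resets both contexts in PATCH 11.
    DemoContext.reset_context()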
diff --git a/auto_round/context/model.py b/auto_round/context/model.py index 1117467a4..52785278f 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -42,16 +42,10 @@ class ModelContext(BaseContext): _is_initialized = False - quantized = False # model_related _model_loaded = False _init_model = False - is_mllm = False - is_diffusion = False - is_model_patched = False - is_moe_model = False - hook_handles = [] def __init__( @@ -66,6 +60,12 @@ def __init__( device="cpu", ): super().__init__() + self.quantized = False + self.is_mllm = False + self.is_diffusion = False + self.is_model_patched = False + self.is_moe_model = False + assert model is not None, "model must be provided for ModelContext" self.model = model self.tokenizer = tokenizer From 7bd3e62b6f24b4c86e88a45cdac8fe542250290b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 18 Mar 2026 17:21:31 +0800 Subject: [PATCH 10/90] fix qweight Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 9 +++++++++ auto_round/compressors_new/zero_shot.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 7da1b479f..92de7e2ba 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1240,5 +1240,14 @@ def quantize(self): clear_memory(device_list=self.compress_context.device_list) self._quant_rtn_with_imatrix() + + convert_module_to_hp_if_necessary( + self.model_context.model, self.model_context.amp_dtype, self.compress_context.device + ) + if self.compress_context.low_cpu_mem_usage: + self._offloader.reload(self.model_context.model) + if self.is_immediate_saving: + self.shard_writer.write(is_finalize=True) + self.model_context.quantized = True return self.model_context.model, self.quantizer.layer_config diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 8132d621e..2048ec9fe 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -315,5 +315,5 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if self.is_immediate_saving: self.shard_writer.write(is_finalize=True) - self.quantized = True + self.model_context.quantized = True return self.model, self.layer_config From 9b149183368d95cd4a33758f49ad6784623cddd2 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 19 Mar 2026 13:57:37 +0800 Subject: [PATCH 11/90] fix ut and refactor code Signed-off-by: n1ck-guo --- .../quantization/auto_round/quantizer.py | 40 +++++-- auto_round/algorithms/quantization/base.py | 49 +++----- auto_round/algorithms/quantization/config.py | 112 ++++++++++-------- .../algorithms/quantization/rtn/config.py | 3 +- .../algorithms/quantization/rtn/quantizer.py | 30 ++--- auto_round/autoround.py | 4 +- auto_round/compressors_new/__init__.py | 21 ++-- auto_round/compressors_new/base.py | 111 +++++++++++++---- auto_round/compressors_new/calib.py | 29 +++-- auto_round/compressors_new/entry.py | 74 +++++++++--- auto_round/compressors_new/mllm_mixin.py | 61 +++++++--- auto_round/compressors_new/shard_writer.py | 4 +- auto_round/compressors_new/zero_shot.py | 12 +- auto_round/context/model.py | 98 ++++++++++----- auto_round/export/utils.py | 36 ++++-- 15 files changed, 453 insertions(+), 231 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index e62bffc59..bbddb0d1d 100644 --- 
a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -218,18 +218,29 @@ def _get_loss( def quantize_block( self, - block: torch.nn.Module, + block_name: Union[str, list[str]], input_ids: Union[list[torch.Tensor], dict], input_others: dict, q_input: Union[torch.Tensor, dict, None] = None, auto_offload=True, **kwargs, ): + """Quantize a block (or multiple blocks fused as WrapperMultiblock). + + Args: + block_name: A single block name, or a list of names when nblocks > 1. + The module(s) are retrieved internally via get_module(). + """ + if isinstance(block_name, list): + from auto_round.wrapper import WrapperMultiblock + + modules = [get_module(self.model, n) for n in block_name] + block = WrapperMultiblock(modules) + else: + block = get_module(self.model, block_name) q_outputs, output = self._quantize_block( block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs ) - if hasattr(block, "config"): - del block.config if self.compress_context.is_immediate_saving: for n, tmp_m in block.named_modules(): if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): @@ -518,7 +529,7 @@ def _quantize_block( return None, output def quantize_layer( - self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu", **kwargs + self, layer_name: str, input_ids: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu", **kwargs ): """Quantize a specific layer of the model using the provided inputs. @@ -537,8 +548,8 @@ def quantize_layer( device = layer.tuning_device layer = layer.to(device) - for i in range(len(inputs)): - inputs[i] = inputs[i].to(layer.weight.dtype) + for i in range(len(input_ids)): + input_ids[i] = input_ids[i].to(layer.weight.dtype) if q_inputs is not None: q_inputs[i] = q_inputs[i].to(layer.weight.dtype) @@ -549,7 +560,7 @@ def quantize_layer( self.config.static_kv_dtype, self.config.static_attention_dtype, ): - tmp_inputs = q_inputs if q_inputs is not None else inputs + tmp_inputs = q_inputs if q_inputs is not None else input_ids hook_handles = self._register_act_max_hook(layer) with torch.no_grad(): for input in tmp_inputs: @@ -592,7 +603,7 @@ def quantize_layer( ) else: lr_schedule = copy.deepcopy(self.lr_scheduler) - nsamples = len(inputs) + nsamples = len(input_ids) last_best_iter = 0 best_loss = torch.finfo(torch.float).max best_params = None @@ -614,7 +625,7 @@ def quantize_layer( if q_inputs is not None: num_elm = self._get_current_num_elm(q_inputs, whole_indices) else: - num_elm = self._get_current_num_elm(inputs, whole_indices) + num_elm = self._get_current_num_elm(input_ids, whole_indices) index_sampler = IndexSampler(nsamples, global_batch_size) @@ -629,10 +640,10 @@ def quantize_layer( if q_inputs is not None: current_input = [q_inputs[i] for i in indices] current_input = torch.cat(current_input, dim=0).to(device) - org_input = [inputs[i] for i in indices] + org_input = [input_ids[i] for i in indices] org_input = torch.cat(org_input, dim=0).to(device) else: - current_input = [inputs[i] for i in indices] + current_input = [input_ids[i] for i in indices] current_input = torch.cat(current_input, dim=0).to(device) org_input = current_input with torch.no_grad(): @@ -810,7 +821,12 @@ def _get_diffusion_block_outputs( end_index = min(nsamples, i + bs) indices = torch.arange(i, end_index).to(torch.long) tmp_input_ids, tmp_input_others = self._sampling_inputs( - input_ids, input_others, indices, self.seqlen, self.batch_dim, 
share_cache_keys=self.shared_cache_keys + input_ids, + input_others, + indices, + self.seqlen, + self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, ) if isinstance(tmp_input_ids, dict): hidden_states = tmp_input_ids.pop("hidden_states") diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 60da2125d..9701857bf 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -146,11 +146,17 @@ def post_init(self): else: enable_gguf_official_mixed = False - if not hasattr(self, "quant_block_list"): - all_blocks = get_block_names(self.model_context.model) + if self.quant_block_list is None: + quant_nontext_module = getattr(self.model_context, "quant_nontext_module", False) + all_blocks = get_block_names(self.model_context.model, quant_vision=quant_nontext_module) self.quant_block_list = find_matching_blocks( self.model_context.model, all_blocks, self.to_quant_block_names ) + if self.to_quant_block_names is None and self.quant_block_list: + from auto_round.utils import extract_block_names_to_str + + self.to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + self.config.to_quant_block_names = self.to_quant_block_names self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) @@ -281,6 +287,7 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) ) def _register_act_max_hook(self, model): + def get_act_max_hook(module, input, output): if isinstance(input, (tuple, list)): input = input[0] @@ -423,40 +430,22 @@ def _quantize_embedding_layer(self): return is_quantized - def quantize_block(self, bock, input_ids, input_others, q_input=False, **kwargs): - """Quantizes a given block of the model based on the specified configuration. - - This method applies quantization to the specified block using the appropriate quantization - function determined by the block's configuration. It handles memory management and supports - optional automatic offloading to manage GPU memory usage. + def quantize_block(self, block_name: str, input_ids=None, input_others=None, **kwargs): + """Quantizes a given block of the model. Args: - block (torch.nn.Module): The block of the model to be quantized. - input_ids (torch.Tensor): The input IDs for the block. - input_others (dict): Additional inputs required for the block's forward pass. - q_input (bool, optional): Whether to quantize the input. Defaults to False. - auto_offload (bool, optional): Whether to automatically offload to manage GPU memory. Defaults to True. - - Returns: - tuple: A tuple containing the quantized outputs and any additional output information. + block_name (str): The name of the block to quantize. The block module is + retrieved internally via get_module(model, block_name). + input_ids: Calibration inputs for the block (required by gradient-based quantizers). + input_others (dict): Additional inputs for the block's forward pass. """ raise NotImplementedError("quantize_block must be implemented in subclasses of BaseQuantizers") - def quantize_layer(self, layer, input_ids, input_others, q_input=False, **kwargs): - """Quantizes a single layer of the model based on the specified configuration. - - This method applies quantization to the specified layer using the appropriate quantization - function determined by the layer's configuration. It handles memory management and supports - optional automatic offloading to manage GPU memory usage. 
+ def quantize_layer(self, layer_name: str, **kwargs): + """Quantizes a single layer of the model. Args: - layer (torch.nn.Module): The layer of the model to be quantized. - input_ids (torch.Tensor): The input IDs for the layer. - input_others (dict): Additional inputs required for the layer's forward pass. - q_input (bool, optional): Whether to quantize the input. Defaults to False. - auto_offload (bool, optional): Whether to automatically offload to manage GPU memory. Defaults to True. - - Returns: - tuple: A tuple containing the quantized outputs and any additional output information. + layer_name (str): The name of the layer to quantize. The layer module is + retrieved internally via get_module(model, layer_name). """ raise NotImplementedError("quantize_layer must be implemented in subclasses of BaseQuantizers") diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index a26eb5d35..68920e3f1 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +from dataclasses import dataclass from enum import Enum -from typing import Union +from typing import ClassVar, Union from auto_round.algorithms.alg_config import AlgConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme @@ -37,51 +38,64 @@ class BackendDataType(str, Enum): FP8 = "fp8" +@dataclass(kw_only=True) class QuantizationConfig(AlgConfig): - _alg_cls: str = None - - def __init__( - self, - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - *, - # quantization args - bits: int = None, - group_size: int = None, - sym: bool = None, - data_type: str = None, - act_bits: int = None, - act_group_size: int = None, - act_sym: bool = None, - act_data_type: str = None, - act_dynamic: bool = None, - super_bits: int = None, - super_group_size: int = None, - scale_dtype: str = None, - ignore_layers: str = "", - quant_lm_head: bool = False, - to_quant_block_names: Union[str, list, None] = None, - ): - - self.scheme = scheme - self.layer_config = layer_config - - self.bits = bits - self.group_size = group_size - self.sym = sym - self.data_type = data_type - self.act_bits = act_bits - self.act_group_size = act_group_size - self.act_sym = act_sym - self.act_data_type = act_data_type - self.act_dynamic = act_dynamic - self.super_bits = super_bits - self.super_group_size = super_group_size - - self.scale_dtype = scale_dtype - self.ignore_layers = ignore_layers - self.quant_lm_head = quant_lm_head - self.to_quant_block_names = to_quant_block_names + _alg_cls: ClassVar[str] = None + + # quantization args + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None + bits: int = None + group_size: int = None + sym: bool = None + data_type: str = None + act_bits: int = None + act_group_size: int = None + act_sym: bool = None + act_data_type: str = None + act_dynamic: bool = None + super_bits: int = None + super_group_size: int = None + scale_dtype: str = None + ignore_layers: str = "" + quant_lm_head: bool = False + to_quant_block_names: Union[str, list, None] = None + + def __post_init__(self): + # Resolve scheme attributes early so properties (is_act_nv_fp, is_wfp8afp8, etc.) 
+ # work correctly at construction time without waiting for post_init(). + self._early_resolve_scheme() + + def _early_resolve_scheme(self) -> None: + """Resolve scheme attributes early so properties work from init time. + + Both entry.py routing (needs_act_calib) and BaseCompressor._adjust_torch_compile + need resolved attributes (act_data_type, data_type, is_act_nv_fp, ...) before + BaseQuantizers.post_init() runs (which is deferred until quantize() / after model + loading). This method performs the same _parse_scheme() call eagerly so those + attributes are available from construction time. + + AutoScheme is left deferred because it requires model information to select its + concrete option. + """ + if isinstance(self.scheme, AutoScheme): + # AutoScheme needs model info for option selection — defer to post_init + return + + # Collect fields that exist in both QuantizationScheme and QuantizationConfig + # where the user explicitly provided a value (non-None). These override the + # scheme's built-in defaults so that e.g. RTNConfig(scheme="NVFP4", bits=8) + # expands NVFP4 but keeps bits=8 instead of the scheme's default bits=4. + user_scheme_overrides = { + k: getattr(self, k) for k in QuantizationScheme.get_attributes() if getattr(self, k, None) is not None + } + + try: + _, _, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides) + vars(self).update(final_attrs) + except Exception: + # Silently ignore failures — post_init() will do the authoritative resolution + pass def check_config(self) -> None: """Checks if the configurations are valid. @@ -153,11 +167,11 @@ def is_dynamic_wint8aint8(self): return False @property - def is_standard_fp(self, act=False): + def is_standard_fp(self): return BackendDataType.STANDARD_FP in self.data_type and not self.is_mx_fp and not self.is_nv_fp @property - def is_act_standard_fp(self, act=False): + def is_act_standard_fp(self): return BackendDataType.STANDARD_FP in self.act_data_type and not self.is_act_mx_fp and not self.is_act_nv_fp @property @@ -173,8 +187,8 @@ def is_wfp8afp8(self): if ( ("fp8" in self.act_data_type or ("fp" in self.act_data_type and self.act_bits == 8)) and ("fp8" in self.data_type or ("fp" in self.data_type and self.bits == 8)) - and self.is_standard_fp(act=True) - and self.is_standard_fp(act=False) + and self.is_act_standard_fp + and self.is_standard_fp ): return True else: diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py index 2eae2d507..cb297529c 100644 --- a/auto_round/algorithms/quantization/rtn/config.py +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -32,6 +32,8 @@ def __init__( batch_size: int = 8, **kwargs, ): + # pop before super().__init__ so it doesn't leak into QuantizationConfig as an unknown kwarg + enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) super().__init__(scheme=scheme, layer_config=layer_config, **kwargs) self.seqlen = seqlen @@ -44,7 +46,6 @@ def __init__( # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it. 
# To avoid None issue, we keep a copy though it's a little ugly - enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) if enable_opt_rtn and disable_opt_rtn: raise ValueError("`enable_opt_rtn` and `disable_opt_rtn` are mutually exclusive; " "only one can be set.") if enable_opt_rtn: diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index b11959d42..bd34b11b6 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -64,10 +64,12 @@ class RTNQuantizer(BaseQuantizers): + def __init__(self, config: RTNConfig): BaseQuantizers.__init__(self, config) - def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): + def quantize_block(self, block_name: str, **kwargs): + block = get_module(self.model, block_name) shard_writer = ShardWriter.get_shard_writer() tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) @@ -100,15 +102,16 @@ def quantize_block(self, block: torch.nn.Module, block_name: str, **kwargs): copied_m = get_module(self.model, m.global_name) copied_m.to("meta") m.to("meta") - # Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage. - # This mirrors _quantize_via_rtn_blockwise's post-block cleanup. - if not self.compress_context.is_immediate_saving: - mv_module_from_gpu(block) - else: - # Save once at block scope to capture tensors that are not saved - # in per-layer branch (e.g., custom module-level params/buffers). - shard_writer.write(name=block_name) - block.to("meta") + + # Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage. + # This mirrors _quantize_via_rtn_blockwise's post-block cleanup. + if not self.compress_context.is_immediate_saving: + mv_module_from_gpu(block) + else: + # Save once at block scope to capture tensors that are not saved + # in per-layer branch (e.g., custom module-level params/buffers). + shard_writer.write(name=block_name) + block.to("meta") def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -209,6 +212,7 @@ def _immediate_pack_and_save_module(self, module_name): class OptimizedRTNQuantizer(RTNQuantizer): + def __init__(self, config: RTNConfig): BaseQuantizers.__init__(self, config) self.batch_size = config.batch_size @@ -221,10 +225,8 @@ def __init__(self, config: RTNConfig): self.enable_alg_ext = True - def quantize_block( - self, block: torch.nn.Module, input_ids: Union[list[torch.Tensor], dict], input_others: dict, **kwargs - ): - + def quantize_block(self, block_name: str, input_ids: Union[list[torch.Tensor], dict], input_others: dict, **kwargs): + block = get_module(self.model, block_name) materialize_model_(block) block.to("cpu") diff --git a/auto_round/autoround.py b/auto_round/autoround.py index a1ccb65aa..ed90bac09 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -54,7 +54,7 @@ class AutoRound: enable_torch_compile (bool): Whether to enable torch.compile for quant blocks/layers. 
""" - SKIP_ARGS = ("local_args", "kwargs", "cls", "model_cls", "dynamic_compressor", "extra_config", "enable_adam") + SKIP_ARGS = ("local_args", "kwargs", "cls", "model_cls", "dynamic_compressor", "extra_config") bits: int | None group_size: int | tuple | None @@ -164,7 +164,7 @@ def __new__( if NEW_ARCH: from auto_round.compressors_new.entry import AutoRound as AutoRoundNew - return AutoRoundNew(**local_args) + return AutoRoundNew(**local_args, **kwargs) model_cls = [] diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py index de63f779e..8bee639c5 100644 --- a/auto_round/compressors_new/__init__.py +++ b/auto_round/compressors_new/__init__.py @@ -18,11 +18,17 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from auto_round.compressors_new.calib import CalibCompressor, ImatrixCompressor + from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor from auto_round.compressors_new.entry import AutoRound, Compressor from auto_round.compressors_new.zero_shot import ZeroShotCompressor -__all__ = ["Compressor", "CalibCompressor", "ImatrixCompressor", "ZeroShotCompressor", "AutoRound"] +__all__ = [ + "Compressor", + "CalibCompressor", + "CalibratedRTNCompressor", + "ZeroShotCompressor", + "AutoRound", +] def __getattr__(name): @@ -33,12 +39,13 @@ def __getattr__(name): if name == "Compressor": return Compressor return AutoRound - elif name == "CalibCompressor" or name == "ImatrixCompressor": - from auto_round.compressors_new.calib import CalibCompressor, ImatrixCompressor + elif name in ("CalibCompressor", "CalibratedRTNCompressor"): + from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor - if name == "CalibCompressor": - return CalibCompressor - return ImatrixCompressor + return { + "CalibCompressor": CalibCompressor, + "CalibratedRTNCompressor": CalibratedRTNCompressor, + }[name] elif name == "ZeroShotCompressor": from auto_round.compressors_new.zero_shot import ZeroShotCompressor diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index af05d9c41..6d4e273c5 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -30,6 +30,8 @@ SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, compile_func, + convert_dtype_str2torch, + extract_block_names_to_str, is_debug_mode, is_hpex_available, memory_monitor, @@ -118,6 +120,7 @@ def __init__( # Model related model_dtype = kwargs.pop("model_dtype", None) trust_remote_code = kwargs.pop("trust_remote_code") if "trust_remote_code" in kwargs else True + quant_nontext_module = kwargs.pop("quant_nontext_module", False) self.static_attention_dtype = kwargs.pop("static_attention_dtype", None) # Attention static dtype @@ -169,6 +172,10 @@ def __init__( logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 + # Reset both context singletons before creating fresh instances so that + # consecutive AutoRound creations don't inherit stale config from earlier ones. 
+ CompressContext.reset_context() + ModelContext.reset_context() # Alternatively, you can use CompressContext.create_context self.compress_context = CompressContext( low_cpu_mem_usage, @@ -179,7 +186,6 @@ def __init__( is_immediate_saving=self.is_immediate_saving, formats=self.formats, ) - ModelContext.reset_context() self.model_context = ModelContext( model, tokenizer=tokenizer, @@ -189,19 +195,50 @@ def __init__( amp=amp, need_calib=self.need_calib, device=self.compress_context.device, + formats=self.formats, + is_act_quantize=self.quantize_config.is_act_quantize, + quant_nontext_module=quant_nontext_module, ) self.shard_writer = None + from auto_round.schemes import get_gguf_scheme + + qc = self.quantize_config + if qc.scale_dtype is None: + qc.scale_dtype = convert_dtype_str2torch("fp32" if get_gguf_scheme(qc.scheme) else "fp16") + + # Apply torch compile adjustments eagerly so that ar.enable_torch_compile + # reflects the correct value immediately after construction (not only after post_init). + self._adjust_torch_compile(enable_torch_compile) + self.compress_context.enable_torch_compile = self.enable_torch_compile + def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" self.enable_torch_compile = enable_torch_compile + + # Determine fp8 / nvfp4 intent from raw config before scheme resolution. + cfg = self.quantize_config + raw_scheme = cfg.scheme if isinstance(cfg.scheme, str) else "" + raw_dt = (cfg.data_type or "").lower() + raw_adt = (cfg.act_data_type or "").lower() + raw_scheme_upper = raw_scheme.upper() + + is_raw_nv_fp = "nv_fp" in raw_dt or "nv_fp" in raw_adt or "NVFP" in raw_scheme_upper + is_raw_fp8 = ( + "fp8" in raw_dt + or "fp8" in raw_adt + or "FP8" in raw_scheme_upper + or ("fp" in raw_dt and getattr(cfg, "bits", 16) == 8) + or ("fp" in raw_adt and getattr(cfg, "act_bits", 16) == 8) + ) + + act_bits = getattr(cfg, "act_bits", 16) or 16 if ( not self.enable_torch_compile and TORCH_VERSION_AT_LEAST_2_6 - and self.quantize_config.act_bits > 8 + and act_bits > 8 and not is_debug_mode() - and "fp8" not in self.quantize_config.data_type - and "fp8" not in self.quantize_config.act_data_type + and not is_raw_fp8 and self.need_calib ): logger.info( @@ -210,37 +247,34 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: "Enabling it can reduce tuning cost by 20%, but it might throw an exception.", ) # On HPU, we rely on torch.compile to speed up the model execution. - if self.enable_torch_compile and self.quantize_config.is_wfp8afp8 and not is_hpex_available(): + if self.enable_torch_compile and is_raw_fp8 and not is_hpex_available(): self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as fp8 is enabled") # TODO: fix https://github.com/intel/auto-round/issues/1109 - if self.enable_torch_compile and self.quantize_config.is_act_nv_fp: + if self.enable_torch_compile and is_raw_nv_fp: self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") def post_init(self): + assert self.model_context._model_loaded, "model should be loaded in ModelContext.__init__" + + # 1. 
Resolve formats (scale_dtype was defaulted early in __init__) + if isinstance(self.formats, str): + self.formats = get_formats(self.formats, self) + if self.formats is not None: + self.compress_context.formats = self.formats + self.shard_writer = ShardWriter(self.model_context.model, bits=8) - self.model_context._load_model() - assert self.model_context._model_loaded, "should load model first" + self.model_context.apply_patches(self.formats) self.quantizer = BaseQuantizers.from_config(self.quantize_config) self.quantizer.post_init() self.wrapper_block = wrapper_block - # TODO: add other algs here when they are ready - # self.other_alg = OtherAlg.from_config(self.other_alg_config) if self.other_alg_config is not None else None - # self.other_alg.post_init() if self.other_alg is not None else None - - # check and update the format based on the current configuration - if isinstance(self.formats, str): - self.formats = get_formats(self.formats, self) - self.compress_context.formats = self.formats - self.shard_writer = ShardWriter(self.model_context.model, bits=8) - - # Set device, must place after model loading + # Set device set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) - # after setting iters + # Re-check torch compile with fully resolved config attrs self._adjust_torch_compile(self.enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile @@ -280,9 +314,32 @@ def __getattr__(self, name: str) -> Any: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + @property + def optimizer(self): + """Return the actual optimizer class, converting string to class for backward compat. + + Old API stored ``self.optimizer = torch.optim.AdamW`` (the class itself). + New arch stores the optimizer name as a string in ``quantize_config.optimizer``. + This property converts it so that ``ar.optimizer == torch.optim.AdamW`` works. + """ + if self.quantize_config is None: + return None + opt = getattr(self.quantize_config, "optimizer", None) + if opt is None: + # Default to AdamW when enable_adam=True and no explicit optimizer was set + if getattr(self.quantize_config, "enable_adam", False): + return torch.optim.AdamW + return None + if isinstance(opt, str): + return getattr(torch.optim, opt, None) + return opt + def _adjust_immediate_packing_and_saving(self): from auto_round.algorithms.quantization.rtn.config import RTNConfig + if self.formats is None: + return + formats = getattr(self, "formats", []) if len(formats) == 1 and not formats[0].is_fake() and self.inplace: self.is_immediate_packing = True @@ -380,15 +437,16 @@ def save_quantized( object: The compressed model object. """ self.output_dir = output_dir + if output_dir is not None: + self.compress_context.output_dir = output_dir if format is not None: if isinstance(format, str) and getattr(self, "formats", None) is None: logger.warning( f"save_quantized with format is deprecated and will be deleted in auto_round version 1.0." f" Please use Compressor(format='{format}' instead)." 
) - formats = get_formats(format, self) - if not hasattr(self, "formats"): - self.formats = formats + self.formats = get_formats(format, self) + self.compress_context.formats = self.formats if not self.model_context.quantized: logger.warning("please run autoround.quantize first") @@ -408,6 +466,8 @@ def save_quantized( from auto_round.version import __version__ serialization_dict["autoround_version"] = __version__ + if serialization_dict.get("to_quant_block_names") is None and self.quantizer.quant_block_list: + serialization_dict["to_quant_block_names"] = extract_block_names_to_str(self.quantizer.quant_block_list) if "scale_dtype" in serialization_dict.keys(): serialization_dict["scale_dtype"] = str(serialization_dict["scale_dtype"]) @@ -458,15 +518,14 @@ def quantize_and_save( self.compress_context.output_dir = output_dir # check and update the format based on the current configuration - if format and self.formats is not None: + if format and self.formats is None: logger.warning( f"quantize_and_save with format is deprecated and will be deleted in auto_round version 1.0." f" Please use Compressor(format='{format}' instead)." ) self.formats = format if self.formats is None: - if self.formats is None: - logger.info("format is not set, using default auto_round format.") + logger.info("format is not set, using default auto_round format.") self.formats = "auto_round" # If multiple formats are specified, enforce inplace=False diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 92de7e2ba..9196c3cb1 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -701,9 +701,9 @@ def _quantize_blocks( else: self._offloader.reload(model, names) - m.config = model.config if hasattr(model, "config") else None + block_name_or_names = n if nblocks == 1 else names q_input, input_ids = self.quantizer.quantize_block( - m, + block_name_or_names, input_ids, input_others, q_input=q_input, @@ -740,7 +740,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: The quantized model and layer configurations. """ self.post_init() - self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) self._check_compatibility() @@ -1006,7 +1005,20 @@ def _check_compatibility(self) -> None: ) -class ImatrixCompressor(CalibCompressor): +class CalibratedRTNCompressor(CalibCompressor): + """CalibCompressor variant for iters=0 RTN that needs calibration data. + + Handles two cases that require forward passes through the model: + - Weight quantization with imatrix (importance-matrix statistics for + improved RTN accuracy on INT / weight-only schemes). + - Activation quantization with static scales (e.g. NVFP4, FP8_STATIC) + where per-tensor or per-channel scale factors must be collected before + the actual quantization step. + + Both cases use OptimizedRTNQuantizer and need a calibration dataset, + which is why they cannot be handled by the zero-shot (no-data) path. + """ + need_calib: bool = True def __init__( @@ -1085,10 +1097,8 @@ def _quantize_via_rtn_blockwise(self) -> None: for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model_context.model, block_name) - self.quantizer.quantize_block( - block, + block_name, input_ids, input_others, ) @@ -1230,7 +1240,6 @@ def quantize(self): tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
""" self.post_init() - self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) formats = getattr(self, "formats", None) or [] if not (any(fmt.is_gguf() for fmt in formats) or self.super_bits is not None): @@ -1242,7 +1251,9 @@ def quantize(self): self._quant_rtn_with_imatrix() convert_module_to_hp_if_necessary( - self.model_context.model, self.model_context.amp_dtype, self.compress_context.device + self.model_context.model, + self.model_context.amp_dtype, + self.compress_context.device, ) if self.compress_context.low_cpu_mem_usage: self._offloader.reload(self.model_context.model) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 059875d75..27a2c6b92 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -9,7 +9,8 @@ from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme -from auto_round.compressors_new.calib import CalibCompressor +from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor +from auto_round.compressors_new.utils import check_need_act_calibration from auto_round.compressors_new.zero_shot import ZeroShotCompressor from auto_round.logger import logger from auto_round.schemes import QuantizationScheme @@ -111,38 +112,44 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): enable_imatrix = True elif is_weight_scheme(config.scheme): enable_imatrix = True - if enable_imatrix: - from auto_round.compressors_new.calib import ImatrixCompressor - # For RTN with imatrix, dynamically combine with model-specific Mixin + needs_act_calib = getattr(config, "is_act_quantize", False) and check_need_act_calibration( + getattr(config, "act_dynamic", None), + getattr(config, "act_data_type", None), + getattr(config, "act_bits", 16), + static_kv_dtype=kwargs.get("static_kv_dtype"), + static_attention_dtype=kwargs.get("static_attention_dtype"), + ) + + if enable_imatrix or needs_act_calib: + config._alg_cls = "OptimizedRTNQuantizer" + # For RTN with calibration data, dynamically combine with model-specific Mixin if model_type == "mllm": from auto_round.compressors_new.mllm_mixin import MLLMMixin - # Create dynamic class: MLLMMixin + ImatrixCompressor - class MLLMImatrixCompressor(MLLMMixin, ImatrixCompressor): - """MLLM model with RTN importance matrix compression""" + class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor): + """MLLM model with calibrated RTN compression""" pass - return MLLMImatrixCompressor(config, **local_args, **kwargs) + return MLLMCalibratedRTNCompressor(config, **local_args, **kwargs) elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - # Create dynamic class: DiffusionMixin + ImatrixCompressor - class DiffusionImatrixCompressor(DiffusionMixin, ImatrixCompressor): - """Diffusion model with RTN importance matrix compression""" + class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor): + """Diffusion model with calibrated RTN compression""" pass - return DiffusionImatrixCompressor(config, **local_args, **kwargs) + return DiffusionCalibratedRTNCompressor(config, **local_args, **kwargs) else: - return ImatrixCompressor(config, **local_args, **kwargs) + return CalibratedRTNCompressor(config, **local_args, **kwargs) else: - # For basic RTN, dynamically combine with 
model-specific Mixin + config._alg_cls = "RTNQuantizer" + # Zero-shot RTN: no calibration data needed if model_type == "mllm": from auto_round.compressors_new.mllm_mixin import MLLMMixin - # Create dynamic class: MLLMMixin + ZeroShotCompressor class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): """MLLM model with zero-shot RTN compression""" @@ -152,7 +159,6 @@ class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - # Create dynamic class: DiffusionMixin + ZeroShotCompressor class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor): """Diffusion model with zero-shot RTN compression""" @@ -214,6 +220,37 @@ class AutoRound: super_bits: int | None super_group_size: int | None + @staticmethod + def _pop_config_kwargs(kwargs: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]: + """Extract old-API config kwargs and split them by config type.""" + common_keys = ( + "ignore_layers", + "quant_lm_head", + "scale_dtype", + "super_bits", + "super_group_size", + "to_quant_block_names", + ) + auto_round_only_keys = ( + "nblocks", + "enable_alg_ext", + "lr_scheduler", + "not_use_best_mse", + "dynamic_max_gap", + "optimizer", + "enable_adam", + "momentum", + ) + common_kwargs = {} + auto_round_kwargs = {} + for key in common_keys: + if key in kwargs: + common_kwargs[key] = kwargs.pop(key) + for key in auto_round_only_keys: + if key in kwargs: + auto_round_kwargs[key] = kwargs.pop(key) + return common_kwargs, auto_round_kwargs + def __new__( cls, model: Union[torch.nn.Module, str], @@ -240,6 +277,8 @@ def __new__( """ from auto_round.utils import is_diffusion_model, is_mllm_model + common_config_kwargs, auto_round_config_kwargs = cls._pop_config_kwargs(kwargs) + # Extract quantization parameters from kwargs or use defaults bits = kwargs.pop("bits", None) group_size = kwargs.pop("group_size", None) @@ -272,6 +311,7 @@ def __new__( seqlen=seqlen, nsamples=nsamples, batch_size=batch_size, + **common_config_kwargs, ) else: # AutoRound mode @@ -303,6 +343,8 @@ def __new__( enable_minmax_tuning=enable_minmax_tuning, enable_norm_bias_tuning=enable_norm_bias_tuning, enable_quanted_input=enable_quanted_input, + **common_config_kwargs, + **auto_round_config_kwargs, ) # Determine output format if specified diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index 14157dcab..d80098eba 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -57,7 +57,11 @@ def __init__( self.extra_data_dir = extra_data_dir self.quant_nontext_module = quant_nontext_module self.template_obj = None + # Backward compat: ar.mllm is expected to be True for MLLM instances + self.mllm = True + # Pass quant_nontext_module to ModelContext so get_block_names can include vision blocks + kwargs.setdefault("quant_nontext_module", quant_nontext_module) # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) @@ -84,29 +88,48 @@ def calib(self, nsamples, bs): if any([m in name for m in MISTRAL_3_2_MODELS]): self.template = "mistral3_2" + template_name = self.template + if template_name is None and hasattr(self.model_context.model.config, "model_type"): + template_name = self.model_context.model.config.model_type + if template_name is None: + template_name = "default" + # Get template - if self.template is not None: - self.template_obj = get_template(self.template) - elif 
hasattr(self.model_context.model.config, "model_type"): - self.template_obj = get_template(self.model_context.model.config.model_type) - else: - self.template_obj = get_template("default") + self.template_obj = get_template( + template_name, + model=self.model_context.model, + tokenizer=self.tokenizer, + processor=self.processor, + image_processor=self.image_processor, + use_rtn=getattr(self.quantize_config, "iters", None) == 0, + quiet=not self.quant_nontext_module, + ) - logger.info(f"Using MLLM template: {self.template or 'default'}") + logger.info(f"Using MLLM template: {template_name}") # Get MLLM dataloader - self.dataloader = get_mllm_dataloader( - self.model_context.model, - self.tokenizer, - self.dataset, - self.processor, - self.image_processor, - nsamples, - self.quantize_config.seqlen, - self.seed, - bs, - self.template_obj, - self.extra_data_dir, + dataset = self.dataset.replace(" ", "") if isinstance(self.dataset, str) else self.dataset + if dataset is None: + dataset = self.template_obj.default_dataset + + ( + self.dataloader, + self.batch_size, + self.seqlen, + self.gradient_accumulate_steps, + ) = get_mllm_dataloader( + template=self.template_obj, + model=self.model_context.model, + tokenizer=self.tokenizer, + processor=self.processor, + image_processor=self.image_processor, + dataset=dataset, + extra_data_dir=self.extra_data_dir, + seqlen=self.quantize_config.seqlen, + bs=bs, + seed=self.seed, + nsamples=nsamples, + quant_nontext_module=self.quant_nontext_module, ) # Process data through the model for calibration diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index d112c96ff..f76c9f7af 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -276,5 +276,5 @@ def write(self, m: torch.nn.Module = None, name: str = None, is_finalize: bool = if is_finalize: self.finalize() - # Optional: cleanup the saver object from rounder - self._initialized = False + ShardWriter._initialized = False + ShardWriter._instance = None diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 2048ec9fe..0c6e4446e 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -140,13 +140,7 @@ def _quantize_via_rtn_blockwise(self) -> None: for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model_context.model, block_name) - - self.quantizer.quantize_block( - block, - input_ids, - input_others, - ) + self.quantizer.quantize_block(block_name, input_ids, input_others) if self.low_cpu_mem_usage and not self.is_immediate_saving: self._offloader.offload(self.model_context.model, block_name) @@ -184,7 +178,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """ self.post_init() - self.model_context.initialize(formats=self.formats, is_act_quantize=self.config.is_act_quantize) formats = self.formats if isinstance(self.formats, list) else [] if not (any(fmt.is_gguf() for fmt in formats) or self.super_bits is not None): @@ -251,8 +244,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: for block_names in all_blocks: for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - self.quantizer.quantize_block(block, block_name=block_name) + self.quantizer.quantize_block(block_name) if self.low_cpu_mem_usage and not self.is_immediate_saving: self._offloader.offload(self.model, 
block_name) diff --git a/auto_round/context/model.py b/auto_round/context/model.py index 52785278f..214e9ef27 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib from typing import Any, Callable, Optional, Union import torch @@ -39,6 +40,10 @@ __all__ = ["ModelContext"] +_CUSTOM_MOE_REPLACEMENT_MODULES = { + "gpt_oss": "auto_round.modeling.fused_moe.gpt_oss", +} + class ModelContext(BaseContext): _is_initialized = False @@ -58,6 +63,9 @@ def __init__( amp=True, need_calib=True, device="cpu", + formats=None, + is_act_quantize=False, + quant_nontext_module=False, ): super().__init__() self.quantized = False @@ -77,8 +85,33 @@ def __init__( self.model_dtype = model_dtype self.trust_remote_code = trust_remote_code self.amp = amp - self.need_calib = need_calib + self.quant_nontext_module = quant_nontext_module + + # Load model and run basic initialization eagerly so the model is ready + # by the time BaseCompressor.post_init() runs. + self._load_model() + + if unsupported_meta_device(self.model): + raise RuntimeError( + "AutoRound does not support parameters on meta device. " + "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU." + ) + check_and_mark_quantized_module(self.model) + self.model = self.model.eval() + self.shared_cache_keys = get_shared_keys(self.model) + + self.is_moe_model = is_moe_model(self.model) + self._import_custom_moe_replacements(getattr(self.model, "config", None)) + + self._set_amp_dtype() + if is_act_quantize and self.amp_dtype == torch.float16: + logger.warning("force to use bf16 for quantization tuning when enabling activation quantization") + self.amp_dtype = torch.bfloat16 + if self.model.dtype != torch.bfloat16: + self.model = self.model.to(torch.bfloat16) + else: + logger.info(f"using {self.model.dtype} for quantization tuning") def _load_model(self): if is_mllm_model(self.model, platform=self.platform): @@ -96,6 +129,7 @@ def _load_model(self): config: Optional[AutoConfig] = None try: config = AutoConfig.from_pretrained(self.model, trust_remote_code=self.trust_remote_code) + self._import_custom_moe_replacements(config) except (OSError, EnvironmentError) as e: logger.debug( "Failed to load config via AutoConfig.from_pretrained for %s: %s. 
" @@ -136,6 +170,34 @@ def _load_model(self): self._model_loaded = True + def _import_custom_moe_replacements(self, model_or_config) -> None: + model_type = getattr(model_or_config, "model_type", None) + module_name = _CUSTOM_MOE_REPLACEMENT_MODULES.get(model_type) + if module_name is None: + return + + module = importlib.import_module(module_name) + from auto_round.modeling.fused_moe.replace_modules import BUILTIN_MODULES + + BUILTIN_MODULES.setdefault(model_type, module) + logger.debug(f"Loaded custom MoE replacement module for {model_type}") + + def _patch_custom_moe_modules(self) -> None: + model_type = getattr(getattr(self.model, "config", None), "model_type", None) + if model_type != "qwen3_vl_moe": + return + + for module in self.model.modules(): + if module.__class__.__name__ != "Qwen3VLMoeTextSparseMoeBlock": + continue + if hasattr(module, "top_k"): + continue + + gate = getattr(module, "gate", None) + top_k = getattr(gate, "top_k", None) + if top_k is not None: + setattr(module, "top_k", top_k) + def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" self.amp_dtype = torch.bfloat16 @@ -158,34 +220,17 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def initialize(self, formats, is_act_quantize=False): - # load and handle model - if not self._model_loaded: - self._load_model() - - if unsupported_meta_device(self.model): - raise RuntimeError( - "AutoRound does not support parameters on meta device. " - "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU." - ) - check_and_mark_quantized_module(self.model) - self.model = self.model.eval() - self.shared_cache_keys = get_shared_keys(self.model) - - # Important Note! This is not very robust, do NOT rely on it to do high risky thing - self.is_moe_model = is_moe_model(self.model) - - self._set_amp_dtype() - if is_act_quantize and self.amp_dtype == torch.float16: - logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") - self.amp_dtype = torch.bfloat16 - if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged - self.model = self.model.to(torch.bfloat16) - else: - logger.info(f"using {self.model.dtype} for quantization tuning") + def apply_patches(self, formats): + """Apply format-specific model structure patches. + Must be called after formats are resolved (list[OutputFormat]) and before + BaseQuantizers.post_init() so that configure_layer_config() operates on the + final model structure (post update_module). Eliminates the need for a + subsequent refresh_quantizer_for_initialized_model() call. + """ # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. 
+ self._patch_custom_moe_modules() self.model = update_module( self.model, formats=formats, trust_remote_code=self.trust_remote_code, cleanup_original=False ) @@ -199,7 +244,6 @@ def initialize(self, formats, is_act_quantize=False): self.model = self.model.to(self.amp_dtype) self._init_model = True - self._is_initialized = True def replace_forward(self, register_hook): diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index df8ebd37a..4688c9f02 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -14,6 +14,7 @@ import json import os +import torch import torch.nn as nn from auto_round.utils import ( @@ -24,6 +25,21 @@ ) +def _save_model_configs(model: nn.Module, save_dir: str) -> None: + if hasattr(model, "config") and model.config is not None: + model.config.save_pretrained(save_dir) + + if hasattr(model, "generation_config") and model.generation_config is not None: + model.generation_config.save_pretrained(save_dir) + + +def _state_dict_has_meta_tensor(model: nn.Module) -> bool: + for tensor in model.state_dict().values(): + if isinstance(tensor, torch.Tensor) and tensor.device.type == "meta": + return True + return False + + def save_model( model: nn.Module, save_dir: str, @@ -54,13 +70,17 @@ def save_model( os.makedirs(save_dir, exist_ok=True) if unsupported_meta_device(model): - if hasattr(model, "config") and model.config is not None: - model.config.save_pretrained(save_dir) - - if hasattr(model, "generation_config") and model.generation_config is not None: - model.generation_config.save_pretrained(save_dir) + _save_model_configs(model, save_dir) else: - model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) + has_meta_tensor = _state_dict_has_meta_tensor(model) + if has_meta_tensor: + logger.info( + "Detected meta tensors in state_dict after shard-based saving; skipping model.save_pretrained and " + "saving configs only." 
+ ) + _save_model_configs(model, save_dir) + else: + model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) try: copy_missing_tensors_from_source( @@ -184,7 +204,9 @@ def filter_quantization_config(quantization_config): "scale_dtype": "torch.float16", "seqlen": 2048, } - iters = quantization_config.get("iters", 200) + iters = quantization_config.get("iters") + if iters is None: + iters = 0 default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3 default_dict["minmax_lr"] = default_dict["lr"] From 2ab9b51b288e226380034b2ef5d6fbf0999fc39e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 19 Mar 2026 19:06:39 +0800 Subject: [PATCH 12/90] fix ut Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/config.py | 33 +- .../architecture_visualization.py | 327 ++++--- auto_round/compressors_new/base.py | 1 + auto_round/compressors_new/diffusion_mixin.py | 81 +- .../docs/compressors_new_architecture.md | 427 +++++---- .../docs/compressors_new_architecture_CN.md | 903 +++++------------- auto_round/compressors_new/mllm_mixin.py | 82 +- auto_round/compressors_new/shard_writer.py | 15 +- auto_round/context/compress.py | 2 + auto_round/context/model.py | 6 + auto_round/formats.py | 2 +- 11 files changed, 791 insertions(+), 1088 deletions(-) diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index 68920e3f1..dd27be99c 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -97,6 +97,17 @@ def _early_resolve_scheme(self) -> None: # Silently ignore failures — post_init() will do the authoritative resolution pass + @staticmethod + def _is_valid_group_size(gs) -> bool: + """Return True if gs is a valid group_size value. + + Accepts -1 (per-channel), 0 (per-tensor), a positive integer, + or a tuple/list of such values (e.g. (128, 128) for block-wise FP8). + """ + if isinstance(gs, (tuple, list)): + return all(QuantizationConfig._is_valid_group_size(g) for g in gs) + return gs == -1 or gs >= 0 + def check_config(self) -> None: """Checks if the configurations are valid. @@ -107,11 +118,16 @@ def check_config(self) -> None: raise ValueError("`bits` must be positive") if self.act_bits <= 0: raise ValueError("`act_bits` must be positive") - if not (self.group_size == -1 or self.group_size >= 0): - raise ValueError("`group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") - if not (self.act_group_size == -1 or self.act_group_size >= 0): - raise ValueError("`act_group_size` must be -1 (per channel) or 0 (per-tensor) or a positive integer") - """Reset the default value of super_bits and super_group_size""" + if not self._is_valid_group_size(self.group_size): + raise ValueError( + "`group_size` must be -1 (per channel), 0 (per-tensor), a positive integer, " + "or a tuple thereof (e.g. (128, 128) for block-wise quantization)" + ) + if not self._is_valid_group_size(self.act_group_size): + raise ValueError( + "`act_group_size` must be -1 (per channel), 0 (per-tensor), a positive integer, " "or a tuple thereof" + ) + # Reset the default value of super_bits and super_group_size if self.data_type.endswith("_dq"): gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"] self.super_bits = gguf_config.get("super_bits", None) if self.super_bits is None else self.super_bits @@ -130,10 +146,11 @@ def check_config(self) -> None: "activation quantization is an experimental feature with limited support and a complex API. 
" "And please save the quantized model to fake format as real deployment is not supported currently" ) - if self.is_mx_fp and self.group_size != 32: + # For block-wise group_size (tuple), skip the scalar-only warnings + scalar_gs = self.group_size if not isinstance(self.group_size, (tuple, list)) else None + if self.is_mx_fp and scalar_gs != 32: logger.warning("dtype mx_fp should only support group_size of 32 in real deployment") - - if self.is_nv_fp and (self.group_size != 16): + if self.is_nv_fp and scalar_gs != 16: logger.warning("dtype nv_fp should only support group_size of 16 in real deployment") @property diff --git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py index 654cdb0e0..6d044c520 100644 --- a/auto_round/compressors_new/architecture_visualization.py +++ b/auto_round/compressors_new/architecture_visualization.py @@ -1,6 +1,5 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 - """ New Architecture Visualization - Mixin Pattern Combination Table @@ -11,101 +10,139 @@ def print_architecture_table(): """Print architecture combination table""" - print("\n" + "=" * 100) + print("\n" + "=" * 110) print("Compressor New Architecture - Mixin Pattern Combination Table") - print("=" * 100 + "\n") + print("=" * 110 + "\n") - # Table header - print(f"{'Model Type':<15} {'Config Type':<20} {'Algorithm':<20} {'Actual Created Class':<35}") - print("-" * 100) + print(f"{'Model Type':<15} {'Config Type':<20} {'Compressor (dynamic class)':<40} {'Base classes':<35}") + print("-" * 110) # LLM combinations - print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'CalibCompressor':<35}") - print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'ImatrixCompressor':<35}") - print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'ZeroShotCompressor':<35}") + print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'CalibCompressor':<40} {'CalibCompressor':<35}") + print(f"{'LLM':<15} {'RTNConfig':<20} {'CalibratedRTNCompressor':<40} {'CalibratedRTNCompressor':<35}") + print(f"{'LLM':<15} {'RTNConfig':<20} {'ZeroShotCompressor':<40} {'ZeroShotCompressor':<35}") print() - # MLLM combinations - print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'MLLMCalibCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompressor':<35}") - print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'MLLMImatrixCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ImatrixCompressor':<35}") - print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'MLLMZeroShotCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ZeroShotCompressor':<35}") + # MLLM combinations (dynamic classes created in entry.py) + print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'MLLMCalibCompressor':<40} {'MLLMMixin + CalibCompressor':<35}") + print( + f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMCalibratedRTNCompressor':<40} {'MLLMMixin + CalibratedRTNCompressor':<35}" + ) + print(f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMZeroShotCompressor':<40} {'MLLMMixin + ZeroShotCompressor':<35}") print() - # Diffusion combinations - print(f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'DiffusionCalibCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompressor':<35}") - print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'DiffusionImatrixCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = 
DiffusionMixin + ImatrixCompressor':<35}") - print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'DiffusionZeroShotCompressor':<35}") - print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ZeroShotCompressor':<35}") + # Diffusion combinations (dynamic classes created in entry.py) + print( + f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'DiffusionCalibCompressor':<40} {'DiffusionMixin + CalibCompressor':<35}" + ) + print( + f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionCalibratedRTNCompressor':<40} {'DiffusionMixin + CalibratedRTNCompressor':<35}" + ) + print( + f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionZeroShotCompressor':<40} {'DiffusionMixin + ZeroShotCompressor':<35}" + ) - print("\n" + "=" * 100 + "\n") + print("\n" + "=" * 110 + "\n") def print_mixin_explanation(): """Print Mixin pattern explanation""" - print("=" * 100) + print("=" * 110) print("Mixin Pattern Explanation") - print("=" * 100 + "\n") + print("=" * 110 + "\n") print("✨ Core Components:") - print("-" * 100) - print(" 1. MLLMMixin - MLLM features (processor, template, etc.)") - print(" 2. DiffusionMixin - Diffusion features (guidance_scale, pipeline, etc.)") - print(" 3. CalibCompressor - Calibration-based compression algorithm (AutoRound)") - print(" 4. ImatrixCompressor - RTN + importance matrix") - print(" 5. ZeroShotCompressor - Zero-shot RTN") + print("-" * 110) + print(" 1. MLLMMixin - MLLM features (processor, template, quant_nontext_module, etc.)") + print(" 2. DiffusionMixin - Diffusion features (pipeline loading, guidance_scale, etc.)") + print(" 3. CalibCompressor - AutoRound: gradient-based calibration quantization") + print(" 4. CalibratedRTNCompressor - RTN with importance-matrix (imatrix) or act calibration") + print(" 5. ZeroShotCompressor - Zero-shot RTN (no calibration data needed)") print("\n🎯 Combination Approach:") - print("-" * 100) - print(" Dynamically create combined classes through multiple inheritance:") - print(" class MLLMCalibCompressor(MLLMMixin, CalibCompressor):") - print(" pass") - print("\n MLLMMixin provides MLLM features, CalibCompressor provides compression algorithm") + print("-" * 110) + print(" Dynamic classes created on-the-fly inside Compressor.__new__():") + print(" class MLLMCalibCompressor(MLLMMixin, CalibCompressor): pass") + print(" class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor): pass") + print(" class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): pass") print("\n💡 Advantages:") - print("-" * 100) - print(" ✓ Flexible Combination: Any model feature can be combined with any compression algorithm") - print(" ✓ Code Reuse: Mixin code is written once and can be reused multiple times") - print(" ✓ Clear Separation: Model features and compression algorithms are completely independent") - print(" ✓ Easy Extension: Adding new model types or new algorithms is straightforward") + print("-" * 110) + print(" ✓ Flexible Combination: Any model type can be combined with any compression algorithm") + print(" ✓ Code Reuse: Mixin code is written once and reused across all compression algorithms") + print(" ✓ Clear Separation: Model-specific logic (Mixin) and compression algorithm are independent") + print(" ✓ Easy Extension: Add new model types without touching existing compressor code") - print("\n" + "=" * 100 + "\n") + print("\n" + "=" * 110 + "\n") + + +def print_post_init_flow(): + """Print the post_init execution flow""" + + print("=" * 110) + print("BaseCompressor.post_init() Execution Flow") + print("=" * 110 + "\n") + + 
print( + """ +BaseCompressor.post_init() +│ +├─ Step 1: Resolve formats (str → list[OutputFormat]) +│ └─ get_formats(self.formats, self) +│ +├─ Step 2: Apply format-specific model patches +│ └─ model_context.apply_patches(formats) +│ ├─ _patch_custom_moe_modules() # e.g. Qwen3VL MoE top_k fix +│ ├─ update_module(model, formats) # add gguf_pack_linear etc. +│ └─ assign global_name to all modules +│ +├─ Step 3: Setup quantizer on the patched model +│ └─ quantizer = BaseQuantizers.from_config(config) +│ └─ quantizer.post_init() +│ ├─ get ModelContext / CompressContext singletons +│ ├─ _parse_scheme() → resolve final quant attrs +│ ├─ get_block_names(quant_vision=quant_nontext_module) +│ ├─ find_matching_blocks() → quant_block_list +│ └─ back-fill to_quant_block_names if it was None +│ +└─ Step 4: Setup device map, torch compile, offloader + """ + ) + + print("=" * 110 + "\n") def print_usage_examples(): """Print usage examples""" - print("=" * 100) + print("=" * 110) print("Usage Examples") - print("=" * 100 + "\n") + print("=" * 110 + "\n") - print("Example 1: MLLM + AutoRound") - print("-" * 100) + print("Example 1: MLLM + AutoRound (gradient-based)") + print("-" * 110) print( """ from auto_round.compressors_new.entry import Compressor from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -config = AutoRoundConfig(scheme="W4A16", iters=200) +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) compressor = Compressor( config=config, model="/models/Qwen2-VL-2B-Instruct", processor=processor, template="qwen2_vl", + quant_nontext_module=False, # set True to also quantize vision encoder ) -# Actually creates: MLLMCalibCompressor (MLLMMixin + CalibCompressor) +# Dynamically creates: class MLLMCalibCompressor(MLLMMixin, CalibCompressor) """ ) - print("\nExample 2: MLLM + RTN + imatrix") - print("-" * 100) + print("\nExample 2: MLLM + RTN with imatrix") + print("-" * 110) print( """ from auto_round.algorithms.quantization.rtn.config import RTNConfig @@ -114,15 +151,15 @@ def print_usage_examples(): compressor = Compressor( config=config, model="/models/Qwen2-VL-2B-Instruct", - format="gguf_k", # Triggers imatrix + format="gguf_k", # gguf_k triggers CalibratedRTNCompressor processor=processor, ) -# Actually creates: MLLMImatrixCompressor (MLLMMixin + ImatrixCompressor) +# Dynamically creates: class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) """ ) print("\nExample 3: Diffusion + AutoRound") - print("-" * 100) + print("-" * 110) print( """ config = AutoRoundConfig(scheme="W4A16", iters=200) @@ -131,110 +168,133 @@ def print_usage_examples(): model="/models/stable-diffusion-2-1", guidance_scale=7.5, ) -# Actually creates: DiffusionCalibCompressor (DiffusionMixin + CalibCompressor) +# Dynamically creates: class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) """ ) - print("\n" + "=" * 100 + "\n") + print("\n" + "=" * 110 + "\n") def print_mro_example(): """Print MRO (Method Resolution Order) example""" - print("=" * 100) + print("=" * 110) print("Method Resolution Order (MRO) Example") - print("=" * 100 + "\n") + print("=" * 110 + "\n") - print("For MLLMCalibCompressor(MLLMMixin, CalibCompressor):") - print("-" * 100) + print("For class MLLMCalibCompressor(MLLMMixin, CalibCompressor):") + print("-" * 110) print( """ -MLLMCalibCompressor +MLLMCalibCompressor (dynamic, created in Compressor.__new__) └─> MLLMMixin └─> CalibCompressor └─> BaseCompressor └─> object Execution order when calling __init__(): - 1. 
MLLMCalibCompressor.__init__() (if defined) + 1. MLLMCalibCompressor.__init__() → not defined, falls through 2. MLLMMixin.__init__() - - Save MLLM-specific parameters (processor, template, etc.) + - Save MLLM-specific attrs: processor, template, quant_nontext_module, … + - kwargs.setdefault("quant_nontext_module", quant_nontext_module) - Call super().__init__() → enters CalibCompressor - 3. CalibCompressor.__init__() - - Save calibration-related parameters (dataset, iters, etc.) - - Call super().__init__() → enters BaseCompressor - 4. BaseCompressor.__init__() - - Base class initialization - -Thus, MLLMCalibCompressor has both: - ✓ MLLM features (from MLLMMixin) - ✓ Calibration compression functionality (from CalibCompressor) + 3. CalibCompressor.__init__() → BaseCompressor.__init__() + - pops quant_nontext_module from kwargs + - Creates ModelContext(…, quant_nontext_module=quant_nontext_module) + - ModelContext.__init__ eagerly loads the model + - Creates CompressContext singleton + +MLLMCalibCompressor instance has: + ✓ MLLM features from MLLMMixin (processor, template, calib() override) + ✓ Calibration compression from CalibCompressor + ✓ Model/context management from BaseCompressor """ ) - print("=" * 100 + "\n") + print("=" * 110 + "\n") def print_decision_tree(): """Print decision tree""" - print("=" * 100) + print("=" * 110) print("Compressor Creation Decision Tree") - print("=" * 100 + "\n") + print("=" * 110 + "\n") print( """ -Compressor.__new__(config, model, ...) +Compressor.__new__(config, model, format, **kwargs) │ ├─ Step 1: Detect model type │ model_type = detect_model_type(model) │ ├─ is_diffusion_model() → "diffusion" -│ ├─ is_mllm_model() → "mllm" -│ └─ else → "llm" +│ ├─ is_mllm_model() → "mllm" +│ └─ else → "llm" │ -├─ Step 2: Determine config type -│ │ -│ ├─ AutoRoundConfig (requires calibration) -│ │ ├─ model_type == "mllm" -│ │ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompressor) -│ │ │ return MLLMCalibCompressor(...) -│ │ │ -│ │ ├─ model_type == "diffusion" -│ │ │ └─> class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) -│ │ │ return DiffusionCalibCompressor(...) -│ │ │ -│ │ └─ model_type == "llm" -│ │ └─> return CalibCompressor(...) -│ │ -│ └─ RTNConfig (zero-shot or imatrix) -│ │ -│ ├─ enable_imatrix == True -│ │ ├─ model_type == "mllm" -│ │ │ └─> class MLLMImatrixCompressor(MLLMMixin, ImatrixCompressor) -│ │ │ return MLLMImatrixCompressor(...) -│ │ │ -│ │ ├─ model_type == "diffusion" -│ │ │ └─> class DiffusionImatrixCompressor(DiffusionMixin, ImatrixCompressor) -│ │ │ return DiffusionImatrixCompressor(...) -│ │ │ -│ │ └─ model_type == "llm" -│ │ └─> return ImatrixCompressor(...) -│ │ -│ └─ enable_imatrix == False -│ ├─ model_type == "mllm" -│ │ └─> class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) -│ │ return MLLMZeroShotCompressor(...) -│ │ -│ ├─ model_type == "diffusion" -│ │ └─> class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) -│ │ return DiffusionZeroShotCompressor(...) -│ │ -│ └─ model_type == "llm" -│ └─> return ZeroShotCompressor(...) 
+├─ isinstance(config, AutoRoundConfig) +│ ├─ model_type == "mllm" +│ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompressor) +│ ├─ model_type == "diffusion" +│ │ └─> class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) +│ └─ model_type == "llm" +│ └─> CalibCompressor +│ +└─ isinstance(config, RTNConfig) + │ + ├─ enable_imatrix OR needs_act_calib → CalibratedRTNCompressor path + │ ├─ gguf_k format → enable_imatrix = True + │ ├─ symmetric int RTN → enable_imatrix = True + │ ├─ static activation quantization → needs_act_calib = True + │ │ + │ ├─ model_type == "mllm" + │ │ └─> class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) + │ ├─ model_type == "diffusion" + │ │ └─> class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor) + │ └─ model_type == "llm" + │ └─> CalibratedRTNCompressor + │ + └─ else (zero-shot) → ZeroShotCompressor path + ├─ model_type == "mllm" + │ └─> class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) + ├─ model_type == "diffusion" + │ └─> class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) + └─ model_type == "llm" + └─> ZeroShotCompressor """ ) - print("=" * 100 + "\n") + print("=" * 110 + "\n") + + +def print_quantizer_interface(): + """Print the BaseQuantizers interface contract""" + + print("=" * 110) + print("BaseQuantizers Interface - Name-based quantize_block / quantize_layer") + print("=" * 110 + "\n") + + print( + """ +All quantizers use module *names* (str) instead of module objects. +The module is retrieved internally via get_module(model, name). + + BaseQuantizers (abstract) + ├─ quantize_block(block_name: Union[str, list[str]], input_ids, input_others, **kwargs) + │ str → get_module(model, block_name) + │ list[str] → WrapperMultiblock([get_module(model, n) for n in block_name]) + │ (used when nblocks > 1 in CalibCompressor) + │ + └─ quantize_layer(layer_name: str, **kwargs) + → get_module(model, layer_name) + + Implementations: + ├─ RTNQuantizer.quantize_block(block_name: str) + ├─ OptimizedRTNQuantizer.quantize_block(block_name: str, input_ids, input_others) + └─ ARQuantizer.quantize_block(block_name: Union[str, list[str]], input_ids, input_others) + """ + ) + + print("=" * 110 + "\n") def main(): @@ -242,14 +302,41 @@ def main(): print_architecture_table() print_mixin_explanation() + print_post_init_flow() print_usage_examples() print_mro_example() print_decision_tree() + print_quantizer_interface() - print("=" * 100) + print("=" * 110) print("🎉 New architecture supports 9 combinations (3 model types × 3 compression algorithms)") - print("=" * 100) + print(" CalibratedRTNCompressor (was ImatrixCompressor) lives in calib.py") + print("=" * 110) if __name__ == "__main__": main() + + print(f"{'LLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'ZeroShotCompressor':<35}") + + print() + + # MLLM combinations + print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'MLLMCalibCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompressor':<35}") + print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'MLLMImatrixCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ImatrixCompressor':<35}") + print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'MLLMZeroShotCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ZeroShotCompressor':<35}") + + print() + + # Diffusion combinations + print(f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'DiffusionCalibCompressor':<35}") + 
print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompressor':<35}") + print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'DiffusionImatrixCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ImatrixCompressor':<35}") + print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN (zero-shot)':<20} {'DiffusionZeroShotCompressor':<35}") + print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ZeroShotCompressor':<35}") + + print("\n" + "=" * 100 + "\n") diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 6d4e273c5..6517f1a65 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -263,6 +263,7 @@ def post_init(self): self.formats = get_formats(self.formats, self) if self.formats is not None: self.compress_context.formats = self.formats + ShardWriter.reset() # Ensure a fresh ShardWriter for every new quantization run self.shard_writer = ShardWriter(self.model_context.model, bits=8) self.model_context.apply_patches(self.formats) diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 71a4e0fb3..1b7e25f38 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -17,8 +17,6 @@ import torch from tqdm import tqdm -from auto_round.algorithms.alg_config import AlgConfig -from auto_round.context.model import ModelContext from auto_round.logger import logger @@ -39,6 +37,12 @@ class DiffusionMixin: guidance_scale: Control how much image generation follows text prompt num_inference_steps: Reference number of denoising steps generator_seed: Seed for initial noise generation + + Design note: + ``ModelContext._load_model()`` loads the diffusion pipeline and sets + ``model_context.pipe`` and ``model_context.model`` (the unet/transformer). + This mixin reads ``self.model_context.pipe`` directly during calibration and + saving so that ``model_context`` remains the single source of truth. """ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_seed=None, **kwargs): @@ -46,64 +50,24 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps self.generator_seed = generator_seed - self.pipe = None # Will be set during model loading - self.pipe_config = None # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) - def post_init(self): - """Override post_init to handle diffusion-specific model loading.""" - # Load diffusion model as pipeline before standard initialization - if isinstance(self.model_context.model, str): - self._load_diffusion_model() - - # Continue with standard post_init - super().post_init() - - def _load_diffusion_model(self): - """Load diffusion model using pipeline. - - This method loads the full diffusion pipeline and extracts the - transformer/unet component for quantization. 
- """ - from auto_round.utils import diffusion_load_model - - if isinstance(self.model_context.model, str): - # Load diffusion pipeline - logger.info(f"Loading diffusion model from {self.model_context.model}") - pipe, pipe_config = diffusion_load_model( - pretrained_model_name_or_path=self.model_context.model, - platform=self.platform, - device=self.compress_context.device, - trust_remote_code=self.model_context.trust_remote_code, - ) - self.pipe = pipe - self.pipe_config = pipe_config - - # Extract the transformer/unet component as the model - if hasattr(pipe, "transformer"): - extracted_model = pipe.transformer - logger.info("Extracted transformer from diffusion pipeline") - elif hasattr(pipe, "unet"): - extracted_model = pipe.unet - logger.info("Extracted unet from diffusion pipeline") - else: - raise ValueError("Cannot find transformer or unet in diffusion pipeline") - - # Replace the model path with the actual model - self.model_context.model = extracted_model - @torch.no_grad() def calib(self, nsamples, bs): """Perform diffusion-specific calibration for quantization. Override parent's calib method to use diffusion dataset loading logic. + The diffusion pipeline is read from ``self.model_context.pipe``. """ from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader - if self.pipe is None: - raise ValueError("Diffusion pipeline must be loaded before calibration") + pipe = self.model_context.pipe + if pipe is None: + raise ValueError( + "Diffusion pipeline not found in model_context. " "Ensure the model was loaded as a diffusion model." + ) logger.warning( "Diffusion model will catch nsamples * num_inference_steps inputs, " @@ -123,13 +87,13 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) - if self.pipe.dtype != self.model.dtype: - self.pipe.to(self.model.dtype) + if pipe.dtype != self.model.dtype: + pipe.to(self.model.dtype) if ( hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 0 - and self.pipe.device != self.model.device + and pipe.device != self.model.device and torch.device(self.model.device).type in ["cuda", "xpu"] ): logger.error( @@ -139,21 +103,21 @@ def calib(self, nsamples, bs): ) exit(-1) - if self.pipe.device != self.model.device: - self.pipe.to(self.model.device) + if pipe.device != self.model.device: + pipe.to(self.model.device) with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: for ids, prompts in self.dataloader: if isinstance(prompts, tuple): prompts = list(prompts) try: - self.pipe( + pipe( prompt=prompts, guidance_scale=self.guidance_scale, num_inference_steps=self.num_inference_steps, generator=( None if self.generator_seed is None - else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + else torch.Generator(device=pipe.device).manual_seed(self.generator_seed) ), ) except NotImplementedError: @@ -204,9 +168,10 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k if output_dir is None: return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs) + pipe = self.model_context.pipe compressed_model = None - for name in self.pipe.components.keys(): - val = getattr(self.pipe, name) + for name in pipe.components.keys(): + val = getattr(pipe, name) sub_module_path = ( os.path.join(output_dir, name) if os.path.basename(os.path.normpath(output_dir)) != name else output_dir ) @@ -223,5 +188,5 @@ def save_quantized(self, output_dir=None, 
format="auto_round", inplace=True, **k ) elif val is not None and hasattr(val, "save_pretrained"): val.save_pretrained(sub_module_path) - self.pipe.config.save_pretrained(output_dir) + pipe.config.save_pretrained(output_dir) return compressed_model diff --git a/auto_round/compressors_new/docs/compressors_new_architecture.md b/auto_round/compressors_new/docs/compressors_new_architecture.md index 858a60cae..7b3012988 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture.md @@ -2,290 +2,313 @@ ## Overview -本文档介绍了 `compressors_new` 的新架构设计,该设计统一了 LLM、MLLM 和 Diffusion 模型的量化入口。 +This document describes the new architecture of `compressors_new`, which provides a unified +quantization entry point for LLM, MLLM, and Diffusion models. -## 架构设计 +## Architecture Design -### 核心思想 +### Core Idea -通过 `entry.py` 中的 `Compressor` 类作为统一入口,根据模型类型和算法配置动态选择合适的 Compressor 实现。 +`Compressor` in `entry.py` is the single entry point. It detects the model type and config +type at construction time and dynamically creates the correct concrete class using multiple +inheritance (Mixin pattern). -### 组件结构 +### Directory Structure ``` compressors_new/ -├── entry.py # 统一入口,自动检测模型类型 -├── base.py # BaseCompressor 基类 -├── calib.py # CalibCompressor (需要校准的算法) -├── zero_shot.py # ZeroShotCompressor (不需要校准的算法) -├── mllm_mixin.py # MLLMCalibCompressor (MLLM + 校准) -└── diffusion_mixin.py # DiffusionCalibCompressor (Diffusion + 校准) +├── entry.py # Unified entry point — Compressor + AutoRound wrapper +├── base.py # BaseCompressor base class + SerializedCompressorConfig +├── calib.py # CalibCompressor (AutoRound gradient-based) +│ # CalibratedRTNCompressor (RTN + imatrix / act-calib) +├── zero_shot.py # ZeroShotCompressor (zero-shot RTN) +├── mllm_mixin.py # MLLMMixin (vision-language model extra logic) +├── diffusion_mixin.py # DiffusionMixin (diffusion pipeline extra logic) +└── docs/ # This document ``` -### 类继承关系 +### Class Hierarchy ``` BaseCompressor - ├── CalibCompressor (基于校准的压缩) - │ ├── MLLMCalibCompressor (MLLM 专用) - │ └── DiffusionCalibCompressor (Diffusion 专用) - │ - └── ZeroShotCompressor (不需要校准) + ├── CalibCompressor (AutoRound, gradient-based calibration) + ├── CalibratedRTNCompressor (RTN + importance-matrix or act calibration) + └── ZeroShotCompressor (RTN, no calibration data needed) + +Mixins (combined dynamically in entry.py): + MLLMMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} + DiffusionMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} ``` -## 使用方法 +## Configuration Layer + +### QuantizationConfig (dataclass) -### 1. 基本用法 +`QuantizationConfig` is declared as a `@dataclass(kw_only=True)`, which eliminates +`__init__` boilerplate. Subclasses call `super().__init__(scheme=..., **kwargs)` as normal: ```python -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from auto_round.compressors_new.entry import Compressor +@dataclass(kw_only=True) +class QuantizationConfig(AlgConfig): + _alg_cls: ClassVar[str] = None # which quantizer class to use -# 创建配置 -config = AutoRoundConfig( - scheme="W4A16", - iters=200, - nsamples=128, -) + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" + bits: int = None + group_size: int = None # also accepts tuple, e.g. (128,128) for block-FP8 + # ... 
other fields -# 统一入口 - 自动检测模型类型 -compressor = Compressor( - config=config, - model="/path/to/model", # 可以是 LLM/MLLM/Diffusion - tokenizer=tokenizer, - platform="hf", - format=None, -) - -# 执行量化 -quantized_model, layer_config = compressor.quantize() + def __post_init__(self): + self._early_resolve_scheme() # eagerly resolves scheme attrs at construction time ``` -### 2. MLLM 模型量化 - -```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +Subclasses: +- `RTNConfig(QuantizationConfig)` — adds `disable_opt_rtn`, `seqlen`, `nsamples`, `batch_size` +- `AutoRoundConfig(QuantizationConfig)` — adds `iters`, `lr`, `nblocks`, `enable_minmax_tuning`, … -config = AutoRoundConfig(scheme="W4A16", iters=200) +### AlgConfig -# 会自动使用 MLLMCalibCompressor -compressor = Compressor( - config=config, - model="/models/Qwen2-VL-2B-Instruct", - tokenizer=tokenizer, - processor=processor, # MLLM 特定参数 - image_processor=image_processor, # MLLM 特定参数 - template="qwen2_vl", # MLLM 特定参数 - extra_data_dir="/path/to/images", # MLLM 特定参数 -) +`AlgConfig` is the base class used as type annotation throughout `compressors_new/`. +Both `QuantizationConfig` and future non-quantization configs inherit from it. -quantized_model, layer_config = compressor.quantize() -``` +## ModelContext -### 3. Diffusion 模型量化 +`ModelContext.__init__` **eagerly loads the model** — by the time `BaseCompressor.__init__` +returns, the model is already loaded in CPU memory. ```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +class ModelContext(BaseContext): + def __init__(self, model, tokenizer, platform, ..., formats, is_act_quantize, quant_nontext_module): + # ... store attrs + self._load_model() # load LLM / MLLM / Diffusion model + check_and_mark_quantized_module(self.model) + self.model = self.model.eval() + self.shared_cache_keys = get_shared_keys(self.model) + self.is_moe_model = is_moe_model(self.model) + self._set_amp_dtype() + + def apply_patches(self, formats): + """Apply format-specific model structure patches. + Called by BaseCompressor.post_init() after formats are resolved. + """ + self._patch_custom_moe_modules() # e.g. Qwen3VL top_k fix + self.model = update_module(self.model, formats=formats, ...) + for n, m in self.model.named_modules(): + m.global_name = n # assign names used by quantizers + self._is_initialized = True +``` -config = AutoRoundConfig(scheme="W4A16", iters=200) +## BaseCompressor.post_init() Flow -# 会自动使用 DiffusionCalibCompressor -compressor = Compressor( - config=config, - model="/models/stable-diffusion-2-1", - platform="hf", - guidance_scale=7.5, # Diffusion 特定参数 - num_inference_steps=50, # Diffusion 特定参数 -) +`post_init()` is called at the start of `quantize()` (not in `__init__`). +The order matters — model patches must come before quantizer setup: -quantized_model, layer_config = compressor.quantize() +``` +post_init() +│ +├─ 1. Resolve formats (str → list[OutputFormat]) +│ +├─ 2. Apply model patches +│ model_context.apply_patches(formats) +│ ├── _patch_custom_moe_modules() +│ ├── update_module(model, formats) # insert gguf_pack_linear, etc. +│ └── assign m.global_name to all modules +│ +├─ 3. 
Setup quantizer on the patched model +│ quantizer = BaseQuantizers.from_config(config) +│ quantizer.post_init() +│ ├── _parse_scheme() → resolve final quant attrs +│ ├── get_block_names(quant_vision=quant_nontext_module) +│ ├── find_matching_blocks() → quant_block_list +│ ├── back-fill to_quant_block_names (if was None) +│ └── configure_layer_config() +│ +└─ 4. Setup device map, torch compile, offloader ``` -## 模型类型检测 +> **No `refresh_quantizer_for_initialized_model()`** — eliminated by running `apply_patches` +> *before* `quantizer.post_init()`. -`entry.py` 中的 `detect_model_type()` 函数自动检测模型类型: +## BaseQuantizers Interface -```python -def detect_model_type(model): - """检测模型类型 +All quantizers accept **names** (str), not module objects. +The module is retrieved internally via `get_module(model, name)`: - Returns: - "mllm" | "diffusion" | "llm" - """ - if is_diffusion_model(model): - return "diffusion" - if is_mllm_model(model): - return "mllm" - return "llm" +```python +class BaseQuantizers: + def quantize_block( + self, + block_name: Union[str, list[str]], # list[str] for nblocks > 1 + input_ids=None, + input_others=None, + **kwargs, + ): ... + + def quantize_layer(self, layer_name: str, **kwargs): ... ``` -检测逻辑: -1. 优先检测是否为 Diffusion 模型(检查 `model_index.json`) -2. 然后检测是否为 MLLM 模型(检查 `processor_config.json` 等) -3. 默认为标准 LLM 模型 - -## 动态 Compressor 选择 +- `str` → `get_module(model, block_name)` +- `list[str]` → `WrapperMultiblock([get_module(model, n) for n in block_name])` (multi-block) -`entry.py` 中的 `Compressor.__new__()` 方法根据以下条件动态选择: - -### 决策树 +## Compressor Selection Decision Tree ``` -Compressor.__new__() +Compressor.__new__(config, model, format, **kwargs) +│ +├─ Detect model type +│ ├─ is_diffusion_model() → "diffusion" +│ ├─ is_mllm_model() → "mllm" +│ └─ else → "llm" │ -├── AutoRoundConfig (需要校准) -│ ├── MLLM → MLLMCalibCompressor -│ ├── Diffusion → DiffusionCalibCompressor -│ └── LLM → CalibCompressor +├─ isinstance(config, AutoRoundConfig) +│ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) +│ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) +│ └─ llm → CalibCompressor │ -└── RTNConfig - ├── enable_imatrix=True → ImatrixCompressor - └── enable_imatrix=False → ZeroShotCompressor +└─ isinstance(config, RTNConfig) + ├─ enable_imatrix OR needs_act_calib → CalibratedRTNCompressor path + │ ├─ gguf_k format → enable_imatrix = True + │ ├─ symmetric int RTN → enable_imatrix = True + │ ├─ static act quantization → needs_act_calib = True + │ │ + │ ├─ mllm → class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) + │ ├─ diffusion → class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor) + │ └─ llm → CalibratedRTNCompressor + │ + └─ else → ZeroShotCompressor path + ├─ mllm → class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) + ├─ diffusion → class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) + └─ llm → ZeroShotCompressor ``` -## 扩展新模型类型 - -如果需要支持新的模型类型,按照以下步骤: - -### 1. 
创建专用 Compressor +## MLLMMixin ```python -# compressors_new/new_model_calib.py -from auto_round.compressors_new.calib import CalibCompressor - +class MLLMMixin: + def __init__( + self, + *args, + processor=None, + image_processor=None, + template=None, + extra_data_dir=None, + quant_nontext_module=False, + **kwargs + ): + self.processor = processor + self.template = template + self.quant_nontext_module = quant_nontext_module + # Pass to ModelContext so get_block_names includes vision blocks + kwargs.setdefault("quant_nontext_module", quant_nontext_module) + super().__init__(*args, **kwargs) -class NewModelCalibCompressor(CalibCompressor): - def __init__(self, config, model, **kwargs): - # 存储模型特定参数 - self.special_param = kwargs.pop("special_param", None) - super().__init__(config, model, **kwargs) - - @torch.no_grad() def calib(self, nsamples, bs): - # 实现模型特定的校准逻辑 - # 通常需要: - # 1. 加载模型特定的 dataloader - # 2. 处理模型特定的数据格式 - # 3. 执行前向传播进行校准 - pass + # Uses get_mllm_dataloader with template / processor + ... ``` -### 2. 更新模型检测逻辑 +`quant_nontext_module` flow: +`MLLMMixin.__init__` → `kwargs.setdefault` → `BaseCompressor.__init__` pops → `ModelContext(quant_nontext_module=...)` +→ `BaseQuantizers.post_init()` calls `get_block_names(quant_vision=quant_nontext_module)` -```python -# 在 entry.py 的 detect_model_type() 中添加 -def detect_model_type(model): - if is_new_model_type(model): # 添加新的检测函数 - return "new_model_type" - if is_diffusion_model(model): - return "diffusion" - # ... -``` +## Usage Examples -### 3. 更新 Compressor 入口 +### Basic LLM quantization ```python -# 在 entry.py 的 Compressor.__new__() 中添加 -if isinstance(config, AutoRoundConfig): - if model_type == "new_model_type": - from auto_round.compressors_new.new_model_calib import NewModelCalibCompressor - return NewModelCalibCompressor(config, **local_args, **kwargs) - elif model_type == "mllm": - # ... -``` +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -## 与旧架构的兼容性 +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) +quantized_model, layer_config = compressor.quantize() +``` -### 旧架构 (compressors/) +### MLLM (vision-language model) ```python -from auto_round.compressors.mllm.compressor import MLLMCompressor - -compressor = MLLMCompressor( - model=model, - # ... 参数 +config = AutoRoundConfig(scheme="W4A16", iters=200) +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + processor=processor, + template="qwen2_vl", + quant_nontext_module=False, # True to also quantize vision encoder ) +# Creates: MLLMCalibCompressor(MLLMMixin, CalibCompressor) ``` -### 新架构 (compressors_new/) +### Diffusion model ```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - -config = AutoRoundConfig(...) +config = AutoRoundConfig(scheme="W4A16", iters=200) compressor = Compressor( config=config, - model=model, - # ... 参数 + model="/models/stable-diffusion-2-1", + guidance_scale=7.5, ) +# Creates: DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) ``` -**优势:** -1. 统一入口,无需手动选择 Compressor -2. 自动模型类型检测 -3. 更好的代码组织和复用 -4. 
易于扩展新模型类型 - -## 实现细节 - -### MLLMCalibCompressor +### RTN zero-shot -重写的关键方法: -- `calib()`: 使用 MLLM 专用的 dataloader 和 template -- 处理 processor, image_processor, template 等 MLLM 特定参数 - -### DiffusionCalibCompressor +```python +from auto_round.algorithms.quantization.rtn.config import RTNConfig -重写的关键方法: -- `post_init()`: 预先加载 diffusion pipeline -- `_load_diffusion_model()`: 加载 pipeline 并提取 transformer/unet -- `calib()`: 使用 diffusion 专用的 dataloader +config = RTNConfig(scheme="W4A16") +compressor = Compressor(config=config, model="/path/to/model") +``` -### 数据流 +### RTN with imatrix (GGUF k-quants) +```python +config = RTNConfig(scheme="W4A16") +compressor = Compressor(config=config, model="/path/to/model", format="gguf_k") +# Creates: CalibratedRTNCompressor (enable_imatrix=True) ``` -1. Compressor.__new__() - └── 检测模型类型 - └── 创建对应的 Compressor 实例 -2. CompressorInstance.__init__() - └── 存储模型特定参数 - └── 调用 super().__init__() +## Extending with New Model Types -3. CompressorInstance.quantize() - └── post_init() - └── _load_model() (可能被重写) - └── calib() (可能被重写) - └── 执行量化算法 +**Step 1**: Create a new Mixin in `compressors_new/`: -4. 返回量化后的模型 -``` +```python +class AudioMixin: + def __init__(self, *args, audio_processor=None, **kwargs): + self.audio_processor = audio_processor + super().__init__(*args, **kwargs) -## 测试 + def calib(self, nsamples, bs): ... +``` -运行测试脚本: +**Step 2**: Add detection in `entry.py`: -```bash -python test_compressor_new_arch.py +```python +def detect_model_type(model): + if is_audio_model(model): + return "audio" + if is_diffusion_model(model): + return "diffusion" + ... ``` -这将测试: -- 模型类型检测 -- LLM Compressor 创建 -- MLLM Compressor 创建 -- Diffusion Compressor 创建 +**Step 3**: Add routing in `Compressor.__new__()`: -## 总结 +```python +if model_type == "audio": + from auto_round.compressors_new.audio_mixin import AudioMixin -新架构的主要优势: + class AudioCalibCompressor(AudioMixin, CalibCompressor): + pass + + return AudioCalibCompressor(config, **local_args, **kwargs) +``` -1. **统一入口**: 一个 `Compressor` 类处理所有模型类型 -2. **自动检测**: 无需手动判断模型类型 -3. **易于扩展**: 添加新模型类型只需3步 -4. **代码复用**: 通过继承复用基类功能 -5. 
**清晰结构**: 每种模型类型有独立的 Compressor 实现 +## Summary -这种设计符合开闭原则(Open-Closed Principle),对扩展开放,对修改关闭。 +| Aspect | Description | +|---|---| +| **Entry point** | Single `Compressor` class, auto-detects model type | +| **Config** | `QuantizationConfig` dataclass; subclasses `RTNConfig`, `AutoRoundConfig` | +| **Model loading** | `ModelContext.__init__` loads eagerly; `apply_patches()` runs before quantizer setup | +| **9 combinations** | 3 model types × 3 compressors, dynamic classes via Mixin | +| **Quantizer interface** | Name-based `quantize_block(name)` / `quantize_layer(name)`, not module objects | +| **Extension** | Add new model type in 3 steps (Mixin class, detect fn, routing) | diff --git a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md index 9c6569fb7..88bebc103 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md @@ -2,786 +2,371 @@ ## 概述 -本文档介绍了 `compressors_new` 的新架构设计,实现了对 LLM、MLLM 和 Diffusion 模型的统一量化入口。 +本文档介绍 `compressors_new` 的新架构设计,为 LLM、MLLM 和 Diffusion 模型提供统一的量化入口。 ## 架构设计 ### 核心思想 -通过 `entry.py` 中的 `Compressor` 类作为统一入口点,根据模型类型和算法配置自动选择合适的 Compressor 实现类。 +`entry.py` 中的 `Compressor` 是唯一入口。构造时自动检测模型类型和配置类型,通过多重继承(Mixin 模式)动态创建正确的具体类。 ### 目录结构 ``` compressors_new/ -├── entry.py # 统一入口,自动检测模型类型 -├── base.py # BaseCompressor 基类 -├── calib.py # CalibCompressor (基于校准的压缩) -├── zero_shot.py # ZeroShotCompressor (零样本压缩) -├── mllm_mixin.py # MLLMCalibCompressor (多模态模型校准压缩) -└── diffusion_mixin.py # DiffusionCalibCompressor (扩散模型校准压缩) +├── entry.py # 统一入口 — Compressor + AutoRound 兼容层 +├── base.py # BaseCompressor 基类 + SerializedCompressorConfig +├── calib.py # CalibCompressor(AutoRound 梯度校准) +│ # CalibratedRTNCompressor(RTN + imatrix / 激活校准) +├── zero_shot.py # ZeroShotCompressor(零样本 RTN) +├── mllm_mixin.py # MLLMMixin(视觉-语言模型扩展逻辑) +├── diffusion_mixin.py # DiffusionMixin(扩散模型 pipeline 扩展逻辑) +└── docs/ # 本文档 ``` ### 类继承关系 ``` -BaseCompressor (基础压缩器) - │ - ├── CalibCompressor (基于校准的压缩器) - │ │ - │ ├── MLLMCalibCompressor (多模态模型专用) - │ │ └── 支持视觉-语言模型(如 Qwen2-VL, LLaVA 等) - │ │ - │ └── DiffusionCalibCompressor (扩散模型专用) - │ └── 支持文生图模型(如 Stable Diffusion, FLUX 等) - │ - └── ZeroShotCompressor (零样本压缩器) - └── 用于 RTN 等不需要校准的算法 +BaseCompressor + ├── CalibCompressor (AutoRound,基于梯度的校准量化) + ├── CalibratedRTNCompressor (RTN + importance-matrix 或激活校准) + └── ZeroShotCompressor (RTN,不需要校准数据) + +Mixin(在 entry.py 中动态组合): + MLLMMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} + DiffusionMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} ``` -## 使用方法 +## 配置层 -### 1. 基本用法(自动检测) +### QuantizationConfig(dataclass) -```python -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from auto_round.compressors_new.entry import Compressor - -# 创建量化配置 -config = AutoRoundConfig( - scheme="W4A16", # 量化方案: 权重4比特,激活16比特 - iters=200, # 迭代次数 - nsamples=128, # 校准样本数 -) - -# 统一入口 - 自动检测模型类型并选择合适的 Compressor -compressor = Compressor( - config=config, - model="/path/to/model", # 支持 LLM/MLLM/Diffusion 模型 - tokenizer=tokenizer, - platform="hf", # 平台: "hf" 或 "model_scope" -) - -# 执行量化 -quantized_model, layer_config = compressor.quantize() -``` - -### 2. 
MLLM 多模态模型量化 +`QuantizationConfig` 声明为 `@dataclass(kw_only=True)`,消除了 `__init__` 中的样板代码。 +子类仍然用 `super().__init__(scheme=..., **kwargs)` 正常调用: ```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from transformers import AutoProcessor, AutoTokenizer +@dataclass(kw_only=True) +class QuantizationConfig(AlgConfig): + _alg_cls: ClassVar[str] = None # 指定使用哪个量化器类 -# 准备 tokenizer 和 processor -tokenizer = AutoTokenizer.from_pretrained("/models/Qwen2-VL-2B-Instruct") -processor = AutoProcessor.from_pretrained("/models/Qwen2-VL-2B-Instruct") - -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) + scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" + bits: int = None + group_size: int = None # 也接受 tuple,如 (128,128) 用于块状 FP8 + # ... 其他字段 -# 自动使用 MLLMCalibCompressor -compressor = Compressor( - config=config, - model="/models/Qwen2-VL-2B-Instruct", - tokenizer=tokenizer, - processor=processor, # MLLM 特定: 多模态处理器 - image_processor=None, # MLLM 特定: 图像处理器 - template="qwen2_vl", # MLLM 特定: 模板名称 - extra_data_dir="/path/to/images", # MLLM 特定: 额外数据路径 - quant_nontext_module=False, # 是否量化非文本模块 -) - -quantized_model, layer_config = compressor.quantize() + def __post_init__(self): + self._early_resolve_scheme() # 构造时即刻解析 scheme 属性 ``` -### 3. Diffusion 扩散模型量化 - -```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +子类: +- `RTNConfig(QuantizationConfig)` — 新增 `disable_opt_rtn`、`seqlen`、`nsamples`、`batch_size` +- `AutoRoundConfig(QuantizationConfig)` — 新增 `iters`、`lr`、`nblocks`、`enable_minmax_tuning` 等 -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +### AlgConfig -# 自动使用 DiffusionCalibCompressor -compressor = Compressor( - config=config, - model="/models/stable-diffusion-2-1", - platform="hf", - guidance_scale=7.5, # Diffusion 特定: 引导强度 - num_inference_steps=50, # Diffusion 特定: 推理步数 - generator_seed=42, # Diffusion 特定: 随机种子 - dataset="coco2014", # 校准数据集 -) +`AlgConfig` 是基类,用于 `compressors_new/` 各处的类型标注。 +`QuantizationConfig` 及未来的非量化配置都继承自它。 -quantized_model, layer_config = compressor.quantize() -``` +## ModelContext -### 4. RTN 量化(零样本) +`ModelContext.__init__` **立即加载模型** —— `BaseCompressor.__init__` 返回时,模型已经在 CPU 内存中。 ```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.rtn.config import RTNConfig - -# RTN 不需要校准数据 -config = RTNConfig(scheme="W4A16") - -# 自动使用 ZeroShotCompressor 或 ImatrixCompressor -compressor = Compressor( - config=config, - model="/path/to/model", - format="gguf_k", # 如果是 gguf_k 格式,会使用 ImatrixCompressor -) - -quantized_model, layer_config = compressor.quantize() +class ModelContext(BaseContext): + def __init__(self, model, tokenizer, platform, ..., formats, is_act_quantize, quant_nontext_module): + # ... 存储属性 + self._load_model() # 加载 LLM / MLLM / Diffusion 模型 + check_and_mark_quantized_module(self.model) + self.model = self.model.eval() + self.shared_cache_keys = get_shared_keys(self.model) + self.is_moe_model = is_moe_model(self.model) + self._set_amp_dtype() + + def apply_patches(self, formats): + """应用格式相关的模型结构补丁。 + 由 BaseCompressor.post_init() 在 formats 解析完毕后调用。 + """ + self._patch_custom_moe_modules() # 如 Qwen3VL top_k 修复 + self.model = update_module(self.model, formats=formats, ...) 
+ for n, m in self.model.named_modules(): + m.global_name = n # 赋予量化器使用的全局名称 + self._is_initialized = True ``` -## 模型类型自动检测 - -`entry.py` 中的 `detect_model_type()` 函数负责自动检测模型类型: - -```python -def detect_model_type(model): - """检测模型类型 - - Args: - model: 模型实例或模型路径字符串 - - Returns: - str: "mllm" | "diffusion" | "llm" - """ - from auto_round.utils import is_mllm_model, is_diffusion_model - - # 1. 优先检测 Diffusion 模型 - if is_diffusion_model(model): - return "diffusion" +## BaseCompressor.post_init() 执行流程 - # 2. 检测 MLLM 模型 - if is_mllm_model(model): - return "mllm" +`post_init()` 在 `quantize()` 开始时调用(不在 `__init__` 中)。 +顺序至关重要——模型补丁必须在量化器初始化之前完成: - # 3. 默认为标准 LLM - return "llm" +``` +post_init() +│ +├─ 1. 解析 formats(str → list[OutputFormat]) +│ +├─ 2. 应用模型补丁 +│ model_context.apply_patches(formats) +│ ├── _patch_custom_moe_modules() +│ ├── update_module(model, formats) # 插入 gguf_pack_linear 等 +│ └── 为所有模块赋予 m.global_name +│ +├─ 3. 在已补丁的模型上初始化量化器 +│ quantizer = BaseQuantizers.from_config(config) +│ quantizer.post_init() +│ ├── _parse_scheme() → 解析最终量化属性 +│ ├── get_block_names(quant_vision=quant_nontext_module) +│ ├── find_matching_blocks() → quant_block_list +│ ├── 反填 to_quant_block_names(如果原来为 None) +│ └── configure_layer_config() +│ +└─ 4. 设置 device_map、torch compile、offloader ``` -### 检测逻辑说明 +> **无 `refresh_quantizer_for_initialized_model()`** —— 旧调用已通过先执行 `apply_patches`、 +> 再调用 `quantizer.post_init()` 的顺序调整消除。 -1. **Diffusion 模型检测** (`is_diffusion_model`): - - 检查目录中是否存在 `model_index.json` 文件 - - 检查是否为 `DiffusionPipeline` 实例 +## BaseQuantizers 接口 -2. **MLLM 模型检测** (`is_mllm_model`): - - 检查是否存在 `processor_config.json` - - 检查是否存在 `preprocessor_config.json` - - 检查 config 中是否包含多模态相关键(vision_config 等) +所有量化器接受**名称**(str),而非模块对象。 +模块在内部通过 `get_module(model, name)` 获取: -3. **LLM 模型** (默认): - - 所有其他情况 +```python +class BaseQuantizers: + def quantize_block( + self, + block_name: Union[str, list[str]], # list[str] 用于 nblocks > 1 + input_ids=None, + input_others=None, + **kwargs, + ): ... -## Compressor 动态选择逻辑 + def quantize_layer(self, layer_name: str, **kwargs): ... 
+``` -`Compressor.__new__()` 方法根据配置类型和模型类型动态创建实例: +- `str` → `get_module(model, block_name)` +- `list[str]` → `WrapperMultiblock([get_module(model, n) for n in block_name])`(多块模式) -### 决策流程图 +## Compressor 选择决策树 ``` -Compressor.__new__() +Compressor.__new__(config, model, format, **kwargs) │ -├─ 检测模型类型 (detect_model_type) -│ ├─ "diffusion" -│ ├─ "mllm" -│ └─ "llm" +├─ 检测模型类型 +│ ├─ is_diffusion_model() → "diffusion" +│ ├─ is_mllm_model() → "mllm" +│ └─ 其他 → "llm" │ -├─ AutoRoundConfig (需要校准) -│ ├─ model_type == "mllm" -│ │ └─> MLLMCalibCompressor -│ │ └─ 使用 MLLM dataloader -│ │ └─ 支持 processor, template 等 -│ │ -│ ├─ model_type == "diffusion" -│ │ └─> DiffusionCalibCompressor -│ │ └─ 加载 diffusion pipeline -│ │ └─ 提取 transformer/unet -│ │ -│ └─ model_type == "llm" -│ └─> CalibCompressor -│ └─ 标准文本数据集 +├─ isinstance(config, AutoRoundConfig) +│ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) +│ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) +│ └─ llm → CalibCompressor │ -└─ RTNConfig (零样本量化) - ├─ enable_imatrix == True - │ └─> ImatrixCompressor - │ └─ 使用 importance matrix +└─ isinstance(config, RTNConfig) + ├─ enable_imatrix 或 needs_act_calib → CalibratedRTNCompressor 路径 + │ ├─ gguf_k 格式 → enable_imatrix = True + │ ├─ 对称 int RTN → enable_imatrix = True + │ ├─ 静态激活量化 → needs_act_calib = True + │ │ + │ ├─ mllm → class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) + │ ├─ diffusion → class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor) + │ └─ llm → CalibratedRTNCompressor │ - └─ enable_imatrix == False - └─> ZeroShotCompressor - └─ 纯 RTN 量化 -``` - -### 代码实现 - -```python -class Compressor(object): - def __new__(cls, config, model, tokenizer=None, platform="hf", format=None, **kwargs): - # 检测模型类型 - model_type = detect_model_type(model) - - if isinstance(config, AutoRoundConfig): - # AutoRound 需要校准 - if model_type == "mllm": - from auto_round.compressors_new.mllm_mixin import MLLMCalibCompressor - - return MLLMCalibCompressor(config, model, tokenizer, platform, format, **kwargs) - elif model_type == "diffusion": - from auto_round.compressors_new.diffusion_mixin import DiffusionCalibCompressor - - return DiffusionCalibCompressor(config, model, tokenizer, platform, format, **kwargs) - else: - return CalibCompressor(config, model, tokenizer, platform, format, **kwargs) - - elif isinstance(config, RTNConfig): - # RTN 可能需要 imatrix - if enable_imatrix: - from auto_round.compressors_new.calib import ImatrixCompressor - - return ImatrixCompressor(config, model, tokenizer, platform, format, **kwargs) - return ZeroShotCompressor(config, model, tokenizer, platform, format, **kwargs) + └─ 其他(零样本) → ZeroShotCompressor 路径 + ├─ mllm → class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) + ├─ diffusion → class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) + └─ llm → ZeroShotCompressor ``` -## 扩展新模型类型 - -如果需要支持新的模型类型,按照以下步骤操作: - -### 步骤 1: 创建专用 Compressor 类 - -在 `compressors_new/` 下创建新文件,例如 `audio_calib.py`: +## MLLMMixin ```python -# compressors_new/audio_calib.py -from typing import Union -import torch -from auto_round.algorithms.alg_config import AlgConfig -from auto_round.compressors_new.calib import CalibCompressor -from auto_round.logger import logger - - -class AudioCalibCompressor(CalibCompressor): - """音频模型专用校准压缩器""" - +class MLLMMixin: def __init__( self, - config: Union[AlgConfig, list[AlgConfig]], - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - format=None, - 
audio_processor=None, # 音频特定参数 - **kwargs, + *args, + processor=None, + image_processor=None, + template=None, + extra_data_dir=None, + quant_nontext_module=False, + **kwargs ): - # 保存音频特定参数 - self.audio_processor = audio_processor + self.processor = processor + self.template = template + self.quant_nontext_module = quant_nontext_module + # 传给 ModelContext,使 get_block_names 包含视觉编码器的块 + kwargs.setdefault("quant_nontext_module", quant_nontext_module) + super().__init__(*args, **kwargs) - # 调用父类初始化 - super().__init__( - config=config, - model=model, - tokenizer=tokenizer, - platform=platform, - format=format, - **kwargs, - ) - - @torch.no_grad() def calib(self, nsamples, bs): - """实现音频模型特定的校准逻辑""" - from your_audio_module import get_audio_dataloader - - logger.info("Preparing audio dataloader...") - - # 获取音频专用的 dataloader - self.dataloader = get_audio_dataloader( - model=self.model_context.model, - audio_processor=self.audio_processor, - dataset=self.dataset, - nsamples=nsamples, - batch_size=bs, - seed=self.seed, - ) - - # 执行校准前向传播 - total_cnt = 0 - for data in self.dataloader: - if data is None: - continue - - # 处理并前向传播 - try: - if isinstance(data, dict): - self.model_context.model(**data) - else: - self.model_context.model(data) - except Exception as e: - logger.warning(f"Calibration failed: {e}") - - total_cnt += bs - if total_cnt >= nsamples: - break - - if total_cnt == 0: - logger.error("No calibration data processed") - exit(-1) + # 使用 get_mllm_dataloader,带 template / processor + ... ``` -### 步骤 2: 更新模型检测逻辑 - -在 `entry.py` 中添加音频模型检测: +`quant_nontext_module` 传递链路: +`MLLMMixin.__init__` → `kwargs.setdefault` → `BaseCompressor.__init__` pop +→ `ModelContext(quant_nontext_module=...)` → `BaseQuantizers.post_init()` +调用 `get_block_names(quant_vision=quant_nontext_module)` -```python -# entry.py -def detect_model_type(model): - """检测模型类型""" - from auto_round.utils import is_mllm_model, is_diffusion_model, is_audio_model - - # 按特殊性从高到低检测 - if is_diffusion_model(model): - return "diffusion" - - if is_audio_model(model): # 新增音频检测 - return "audio" +## MRO(方法解析顺序)示例 - if is_mllm_model(model): - return "mllm" - - return "llm" +``` +MLLMCalibCompressor(entry.py 中动态创建) + └─> MLLMMixin + └─> CalibCompressor + └─> BaseCompressor + └─> object + +调用 __init__() 的执行顺序: + 1. MLLMCalibCompressor.__init__() → 未定义,向上查找 + 2. MLLMMixin.__init__() + - 保存 MLLM 专属属性:processor、template、quant_nontext_module 等 + - kwargs.setdefault("quant_nontext_module", ...) + - super().__init__() → 进入 CalibCompressor + 3. 
CalibCompressor.__init__() → BaseCompressor.__init__() + - pop quant_nontext_module from kwargs + - 创建 ModelContext(..., quant_nontext_module=quant_nontext_module) + - ModelContext.__init__ 立即加载模型 + - 创建 CompressContext 单例 + +结果:MLLMCalibCompressor 实例同时具备: + ✓ MLLMMixin 提供的 MLLM 特性(processor、template、calib() 重写) + ✓ CalibCompressor 提供的梯度校准量化 + ✓ BaseCompressor 提供的模型/上下文管理 ``` -### 步骤 3: 更新 Compressor 入口 +## 使用示例 -在 `entry.py` 的 `Compressor.__new__()` 中添加音频分支: +### 基本 LLM 量化 ```python -class Compressor(object): - def __new__(cls, config, model, tokenizer=None, platform="hf", format=None, **kwargs): - local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} - - # 检测模型类型 - model_type = detect_model_type(model) - - if isinstance(config, AutoRoundConfig): - # 新增音频分支 - if model_type == "audio": - from auto_round.compressors_new.audio_calib import AudioCalibCompressor - - return AudioCalibCompressor(config, **local_args, **kwargs) - elif model_type == "mllm": - from auto_round.compressors_new.mllm_mixin import MLLMCalibCompressor +from auto_round.compressors_new.entry import Compressor +from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - return MLLMCalibCompressor(config, **local_args, **kwargs) - # ... 其他分支 +config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) +quantized_model, layer_config = compressor.quantize() ``` -### 步骤 4: 实现模型检测函数 - -在 `auto_round/utils/model.py` 中添加: +### MLLM(视觉-语言模型) ```python -def is_audio_model(model_or_path: Union[str, torch.nn.Module]) -> bool: - """检测是否为音频模型""" - if isinstance(model_or_path, str): - # 检查配置文件中的特征 - config_path = os.path.join(model_or_path, "config.json") - if os.path.exists(config_path): - with open(config_path) as f: - config = json.load(f) - # 检查是否包含音频相关配置 - if "audio_config" in config: - return True - if config.get("model_type") in ["whisper", "wav2vec2", "hubert"]: - return True - - if isinstance(model_or_path, torch.nn.Module): - # 检查模块中是否有音频相关组件 - for name, module in model_or_path.named_modules(): - if "audio" in name.lower(): - return True - - return False +config = AutoRoundConfig(scheme="W4A16", iters=200) +compressor = Compressor( + config=config, + model="/models/Qwen2-VL-2B-Instruct", + processor=processor, + template="qwen2_vl", + quant_nontext_module=False, # True 则同时量化视觉编码器 +) +# 创建:MLLMCalibCompressor(MLLMMixin, CalibCompressor) +quantized_model, layer_config = compressor.quantize() ``` -## 实现细节 - -### MLLMCalibCompressor 关键实现 +### Diffusion 扩散模型 ```python -class MLLMCalibCompressor(CalibCompressor): - def __init__( - self, config, model, processor=None, image_processor=None, template=None, extra_data_dir=None, **kwargs - ): - # 保存 MLLM 特定参数 - self.processor = processor - self.image_processor = image_processor - self.template = template - self.extra_data_dir = extra_data_dir - super().__init__(config, model, **kwargs) - - @torch.no_grad() - def calib(self, nsamples, bs): - # 1. 选择合适的 template - self.template_obj = get_template(self.template or "default") - - # 2. 获取 MLLM dataloader - self.dataloader = get_mllm_dataloader( - model=self.model_context.model, - tokenizer=self.tokenizer, - dataset=self.dataset, - processor=self.processor, - image_processor=self.image_processor, - nsamples=nsamples, - seqlen=self.quantize_config.seqlen, - seed=self.seed, - batch_size=bs, - template=self.template_obj, - extra_data_dir=self.extra_data_dir, - ) - - # 3. 
执行校准 - for data in self.dataloader: - self.model_context.model(**data) +config = AutoRoundConfig(scheme="W4A16", iters=200) +compressor = Compressor( + config=config, + model="/models/stable-diffusion-2-1", + guidance_scale=7.5, +) +# 创建:DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) ``` -**关键点:** -- 处理 `processor`, `image_processor`, `template` 等 MLLM 特定参数 -- 使用 `get_mllm_dataloader` 获取多模态数据 -- 支持自定义数据目录 (`extra_data_dir`) - -### DiffusionCalibCompressor 关键实现 +### RTN 零样本 ```python -class DiffusionCalibCompressor(CalibCompressor): - def __init__(self, config, model, guidance_scale=7.5, num_inference_steps=50, **kwargs): - self.guidance_scale = guidance_scale - self.num_inference_steps = num_inference_steps - self.pipe = None - super().__init__(config, model, **kwargs) - - def post_init(self): - # 预先加载 diffusion pipeline - if isinstance(self.model_context.model, str): - self._load_diffusion_model() - super().post_init() - - def _load_diffusion_model(self): - # 加载完整的 pipeline - pipe, pipe_config = diffusion_load_model( - pretrained_model_name_or_path=self.model_context.model, - platform=self.platform, - device=self.compress_context.device, - ) - self.pipe = pipe - - # 提取 transformer 或 unet 用于量化 - if hasattr(pipe, "transformer"): - self.model_context.model = pipe.transformer - elif hasattr(pipe, "unet"): - self.model_context.model = pipe.unet - - @torch.no_grad() - def calib(self, nsamples, bs): - # 获取 diffusion dataloader - self.dataloader = get_diffusion_dataloader( - pipe=self.pipe, - dataset=self.dataset, - nsamples=nsamples, - batch_size=bs, - seed=self.seed, - guidance_scale=self.guidance_scale, - num_inference_steps=self.num_inference_steps, - ) - - # 执行校准 - for data in self.dataloader: - self.model_context.model(**data) -``` - -**关键点:** -- 需要加载完整的 diffusion pipeline -- 从 pipeline 中提取 transformer/unet 组件 -- 使用扩散模型特定的数据生成逻辑 - -### 完整数据流 +from auto_round.algorithms.quantization.rtn.config import RTNConfig +config = RTNConfig(scheme="W4A16") +compressor = Compressor(config=config, model="/path/to/model") ``` -1. 用户调用 Compressor(config, model, ...) - │ - ├─> Compressor.__new__() - │ ├─> detect_model_type(model) - │ │ └─> 返回 "llm" | "mllm" | "diffusion" - │ │ - │ └─> 根据 config 类型和 model_type 创建实例 - │ ├─> MLLMCalibCompressor (MLLM + AutoRound) - │ ├─> DiffusionCalibCompressor (Diffusion + AutoRound) - │ ├─> CalibCompressor (LLM + AutoRound) - │ ├─> ImatrixCompressor (RTN + imatrix) - │ └─> ZeroShotCompressor (RTN) - │ -2. 实例.__init__() - │ ├─> 保存模型特定参数 - │ └─> super().__init__() 调用父类 - │ -3. 用户调用 compressor.quantize() - │ - ├─> post_init() - │ ├─> _load_model() (可能被子类重写) - │ └─> 初始化 quantizer - │ - ├─> calib(nsamples, bs) (可能被子类重写) - │ ├─> 准备 dataloader (模型特定) - │ └─> 执行校准前向传播 - │ - ├─> cache_inter_data() - │ └─> 缓存中间激活值 - │ - ├─> 对每个 block 执行量化 - │ └─> 运行量化算法 (AutoRound/RTN 等) - │ - └─> 返回 (quantized_model, layer_config) -``` - -## 与旧架构对比 -### 旧架构 (`compressors/`) +### RTN + imatrix(GGUF k-quants) -**使用方式:** ```python -# 需要手动选择 Compressor -from auto_round.compressors.mllm.compressor import MLLMCompressor -from auto_round.compressors.diffusion.compressor import DiffusionCompressor - -# MLLM -mllm_compressor = MLLMCompressor( - model=model, - scheme="W4A16", - iters=200, - # ... 很多参数 -) - -# Diffusion -diffusion_compressor = DiffusionCompressor( - model=model, - scheme="W4A16", - iters=200, - # ... 
很多参数 -) +config = RTNConfig(scheme="W4A16") +compressor = Compressor(config=config, model="/path/to/model", format="gguf_k") +# 创建:CalibratedRTNCompressor(enable_imatrix=True) ``` -**问题:** -- 用户需要手动判断模型类型 -- 需要导入不同的 Compressor 类 -- 参数直接传给 Compressor,没有统一的配置对象 -- 每个 Compressor 都是独立实现,代码重复 +## 扩展新模型类型 -### 新架构 (`compressors_new/`) +**第 1 步**:在 `compressors_new/` 中创建新 Mixin: -**使用方式:** ```python -# 统一入口,自动检测 -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +class AudioMixin: + def __init__(self, *args, audio_processor=None, **kwargs): + self.audio_processor = audio_processor + super().__init__(*args, **kwargs) -# 同一个入口处理所有模型类型 -compressor = Compressor( - config=config, - model=model, # 自动检测是 LLM/MLLM/Diffusion - tokenizer=tokenizer, - # 模型特定参数... -) + def calib(self, nsamples, bs): + # 音频专用 dataloader + ... ``` -**优势:** -- ✅ 自动模型类型检测 -- ✅ 统一的配置对象 (AlgConfig) -- ✅ 单一入口点 -- ✅ 通过继承复用代码 -- ✅ 易于扩展新模型类型 - -## 测试 - -### 运行测试脚本 - -```bash -# 运行完整测试 -python test_compressor_new_arch.py +**第 2 步**:在 `entry.py` 中添加检测逻辑: -# 测试特定类型 -python -c "from test_compressor_new_arch import test_mllm_compressor; test_mllm_compressor()" +```python +def detect_model_type(model): + if is_audio_model(model): + return "audio" + if is_diffusion_model(model): + return "diffusion" + ... ``` -### 测试内容 - -1. **模型类型检测测试** - ```python - from auto_round.compressors_new.entry import detect_model_type - - assert detect_model_type("/models/opt-125m/") == "llm" - assert detect_model_type("/models/Qwen2-VL-2B-Instruct") == "mllm" - assert detect_model_type("/models/stable-diffusion-2-1") == "diffusion" - ``` - -2. **Compressor 创建测试** - ```python - from auto_round.compressors_new.entry import Compressor - from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - - config = AutoRoundConfig(scheme="W4A16") +**第 3 步**:在 `Compressor.__new__()` 中添加路由: - # 测试 LLM - comp = Compressor(config=config, model="/models/opt-125m/") - assert isinstance(comp, CalibCompressor) +```python +if model_type == "audio": + from auto_round.compressors_new.audio_mixin import AudioMixin - # 测试 MLLM - comp = Compressor(config=config, model="/models/Qwen2-VL-2B-Instruct") - assert isinstance(comp, MLLMCalibCompressor) + class AudioCalibCompressor(AudioMixin, CalibCompressor): + pass - # 测试 Diffusion - comp = Compressor(config=config, model="/models/stable-diffusion-2-1") - assert isinstance(comp, DiffusionCalibCompressor) - ``` + return AudioCalibCompressor(config, **local_args, **kwargs) +``` ## 常见问题 -### Q1: 如何判断我的模型会使用哪个 Compressor? - -**A:** 运行以下代码查看: +### Q1:如何确认我的模型会使用哪个 Compressor? ```python from auto_round.compressors_new.entry import detect_model_type, Compressor from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig model_path = "/your/model/path" +print(f"模型类型: {detect_model_type(model_path)}") -# 检测模型类型 -model_type = detect_model_type(model_path) -print(f"Model type: {model_type}") - -# 创建 compressor 并查看类型 config = AutoRoundConfig(scheme="W4A16") comp = Compressor(config=config, model=model_path) -print(f"Compressor type: {type(comp).__name__}") -``` - -### Q2: 如何传递模型特定的参数? 
- -**A:** 直接传递给 `Compressor()`,它会自动转发: - -```python -# MLLM 特定参数 -compressor = Compressor( - config=config, - model=mllm_model_path, - processor=processor, # MLLM 特定 - template="qwen2_vl", # MLLM 特定 - extra_data_dir="/data/imgs", # MLLM 特定 -) - -# Diffusion 特定参数 -compressor = Compressor( - config=config, - model=diffusion_model_path, - guidance_scale=7.5, # Diffusion 特定 - num_inference_steps=50, # Diffusion 特定 -) +print(f"Compressor 类型: {type(comp).__name__}") ``` -### Q3: 新架构是否向后兼容? - -**A:** 是的,旧的 `compressors/` 仍然可用: - -```python -# 旧方式仍然工作 -from auto_round.compressors.mllm.compressor import MLLMCompressor - -comp = MLLMCompressor(model=..., scheme="W4A16", ...) - -# 新方式 (推荐) -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - -config = AutoRoundConfig(scheme="W4A16") -comp = Compressor(config=config, model=...) -``` - -### Q4: RTN 和 AutoRound 的区别? - -**A:** +### Q2:RTN 和 AutoRound 有什么区别? | 特性 | RTN | AutoRound | |------|-----|-----------| -| 需要校准数据 | ❌ 否 | ✅ 是 | +| 需要校准数据 | ❌ 否(ZeroShot)/ ✅ 是(Calibrated) | ✅ 是 | | 量化质量 | 较低 | 较高 | | 量化速度 | 快 | 慢 | -| Compressor | ZeroShotCompressor | CalibCompressor 系列 | +| Compressor | ZeroShotCompressor / CalibratedRTNCompressor | CalibCompressor | -```python -# RTN - 快速但质量较低 -from auto_round.algorithms.quantization.rtn.config import RTNConfig +### Q3:`group_size` 可以是 tuple 吗? -config = RTNConfig(scheme="W4A16") - -# AutoRound - 慢但质量较高 -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - -config = AutoRoundConfig(scheme="W4A16", iters=200) -``` +可以。块状 FP8(如 `FP8_BLOCK` scheme)会将 `group_size` 设置为 `(128, 128)`, +`check_config()` 已通过 `_is_valid_group_size()` 静态方法正确处理 tuple/list/scalar 三种形式。 ## 总结 -新架构的核心优势: - -| 特性 | 说明 | 好处 | -|------|------|------| -| 🎯 **统一入口** | 一个 `Compressor` 类处理所有模型 | 简化使用,降低学习成本 | -| 🔍 **自动检测** | 自动识别 LLM/MLLM/Diffusion | 无需手动判断模型类型 | -| 🧩 **配置对象** | 使用 `AlgConfig` 统一配置 | 参数管理更清晰 | -| 🏗️ **继承复用** | 通过继承共享基类功能 | 减少代码重复 | -| 🔌 **易于扩展** | 3步添加新模型类型 | 符合开闭原则 | -| 🔄 **向后兼容** | 旧 API 仍然可用 | 平滑迁移 | - -### 迁移建议 - -**从旧架构迁移到新架构:** - -```python -# 旧代码 -from auto_round.compressors.mllm.compressor import MLLMCompressor - -comp = MLLMCompressor( - model=model, - scheme="W4A16", - iters=200, - nsamples=128, - # ... 更多参数 -) - -# 新代码 -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig - -config = AutoRoundConfig( - scheme="W4A16", - iters=200, - nsamples=128, -) -comp = Compressor( - config=config, - model=model, - # 模型特定参数自动识别 -) -``` - -**迁移步骤:** -1. 导入 `Compressor` 和 `AutoRoundConfig` -2. 创建 `config` 对象,将量化相关参数放入 config -3. 将模型特定参数直接传递给 `Compressor()` -4. 移除手动的模型类型判断代码 +| 特性 | 说明 | +|---|---| +| **统一入口** | 单一 `Compressor` 类,自动检测模型类型 | +| **配置** | `QuantizationConfig` dataclass;子类 `RTNConfig`、`AutoRoundConfig` | +| **模型加载** | `ModelContext.__init__` 立即加载;`apply_patches()` 在量化器初始化前运行 | +| **9 种组合** | 3 种模型类型 × 3 种 Compressor,通过 Mixin 动态创建 | +| **量化器接口** | 基于名称的 `quantize_block(name)` / `quantize_layer(name)`,非模块对象 | +| **扩展** | 3 步添加新模型类型(Mixin 类、检测函数、路由) | -这种设计使得代码更加模块化、可维护和可扩展,同时保持了简单易用的 API 接口。 diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index d80098eba..d8ec59cbf 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Union
-
 import torch
 
-from auto_round.algorithms.alg_config import AlgConfig
 from auto_round.logger import logger
 
 
@@ -33,11 +30,19 @@ class MLLMMixin:
     - ZeroShotCompressor (for basic RTN)
 
     MLLM-specific parameters:
-        processor: Multi-modal processor for encoding/decoding data
-        image_processor: Image processor for models like LLaVA
-        template: Template for processing different MLLMs
+        processor: Multi-modal processor override (normally loaded by ModelContext)
+        image_processor: Image processor override (e.g. for LLaVA)
+        template: Template name for processing different MLLMs
         extra_data_dir: Path to extra data (images, audio, videos)
         quant_nontext_module: Whether to quantize non-text modules
+
+    Design note:
+        ``ModelContext._load_model()`` is responsible for loading the model and its
+        associated artifacts (processor, tokenizer, image_processor). This mixin
+        reads those artifacts from ``self.model_context`` during calibration.
+        If the caller passes explicit ``processor`` / ``image_processor`` overrides,
+        they are written into ``model_context`` after ``super().__init__()`` so that
+        ``model_context`` remains the single source of truth.
     """
 
     def __init__(
@@ -50,9 +55,6 @@ def __init__(
         quant_nontext_module=False,
         **kwargs,
     ):
-        # Store MLLM-specific attributes before calling super().__init__
-        self.processor = processor
-        self.image_processor = image_processor
         self.template = template
         self.extra_data_dir = extra_data_dir
         self.quant_nontext_module = quant_nontext_module
@@ -62,14 +64,23 @@ def __init__(
 
         # Pass quant_nontext_module to ModelContext so get_block_names can include vision blocks
         kwargs.setdefault("quant_nontext_module", quant_nontext_module)
-        # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc)
+
+        # super().__init__() creates model_context, which eagerly loads the model and
+        # populates model_context.processor / image_processor / tokenizer.
         super().__init__(*args, **kwargs)
 
+        # Apply user-provided overrides into model_context (single source of truth).
+        if processor is not None:
+            self.model_context.processor = processor
+        if image_processor is not None:
+            self.model_context.image_processor = image_processor
+
     @torch.no_grad()
     def calib(self, nsamples, bs):
         """Perform MLLM-specific calibration for quantization.
 
         Override parent's calib method to use MLLM dataset loading logic.
+        All multimodal artifacts are read from ``self.model_context``.
""" from transformers import PreTrainedModel @@ -77,37 +88,40 @@ def calib(self, nsamples, bs): from auto_round.compressors.mllm.template import get_template from auto_round.special_model_handler import MISTRAL_3_2_MODELS + mc = self.model_context + processor = mc.processor + image_processor = mc.image_processor + tokenizer = mc.tokenizer + # Handle template selection - if isinstance(self.model_context.model, PreTrainedModel): - model_type = getattr(self.model_context.model.config, "model_type", None) + if isinstance(mc.model, PreTrainedModel): + model_type = getattr(mc.model.config, "model_type", None) if model_type == "llava" and self.template is None: self.template = "default" - if hasattr(self.model_context.model, "name_or_path"): - name = self.model_context.model.name_or_path + if hasattr(mc.model, "name_or_path"): + name = mc.model.name_or_path if any([m in name for m in MISTRAL_3_2_MODELS]): self.template = "mistral3_2" template_name = self.template - if template_name is None and hasattr(self.model_context.model.config, "model_type"): - template_name = self.model_context.model.config.model_type + if template_name is None and hasattr(mc.model.config, "model_type"): + template_name = mc.model.config.model_type if template_name is None: template_name = "default" - # Get template self.template_obj = get_template( template_name, - model=self.model_context.model, - tokenizer=self.tokenizer, - processor=self.processor, - image_processor=self.image_processor, + model=mc.model, + tokenizer=tokenizer, + processor=processor, + image_processor=image_processor, use_rtn=getattr(self.quantize_config, "iters", None) == 0, quiet=not self.quant_nontext_module, ) logger.info(f"Using MLLM template: {template_name}") - # Get MLLM dataloader dataset = self.dataset.replace(" ", "") if isinstance(self.dataset, str) else self.dataset if dataset is None: dataset = self.template_obj.default_dataset @@ -119,10 +133,10 @@ def calib(self, nsamples, bs): self.gradient_accumulate_steps, ) = get_mllm_dataloader( template=self.template_obj, - model=self.model_context.model, - tokenizer=self.tokenizer, - processor=self.processor, - image_processor=self.image_processor, + model=mc.model, + tokenizer=tokenizer, + processor=processor, + image_processor=image_processor, dataset=dataset, extra_data_dir=self.extra_data_dir, seqlen=self.quantize_config.seqlen, @@ -138,23 +152,19 @@ def calib(self, nsamples, bs): if data is None: continue - # MLLM data is usually already properly formatted if isinstance(data, dict): - # Move all tensors to device - data_new = {} - for key, value in data.items(): - if isinstance(value, torch.Tensor): - data_new[key] = value.to(self.model_context.model.device) - else: - data_new[key] = value + data_new = { + key: value.to(mc.model.device) if isinstance(value, torch.Tensor) else value + for key, value in data.items() + } else: data_new = data try: if isinstance(data_new, dict): - self.model_context.model(**data_new) + mc.model(**data_new) else: - self.model_context.model(data_new) + mc.model(data_new) except NotImplementedError: pass except Exception as e: diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index f76c9f7af..0cba008e5 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -96,12 +96,19 @@ def __init__( ShardWriter._initialized = True + @classmethod + def reset(cls): + """Reset the singleton state so the next instantiation creates a fresh ShardWriter.""" + cls._initialized = 
False + cls._instance = None + @classmethod def get_shard_writer(cls, *args, **kwargs): - if cls._instance is None: - raise ValueError( - "ShardWriter has not been initialized yet. Please create an instance before calling get_shard_writer." - ) + """Return the current singleton instance, or None if not yet initialized. + + Callers that require a valid writer should guard the result with + ``if self.compress_context.is_immediate_saving`` before use. + """ return cls._instance def _parse_size(self, size_str: str) -> int: diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index a67ac1966..b891d6735 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -28,6 +28,7 @@ class CompressContext(BaseContext): + def __init__( self, low_cpu_mem_usage: bool = True, @@ -56,5 +57,6 @@ def __init__( self.enable_torch_compile = enable_torch_compile self.immediate_packing = is_immediate_packing + self.is_immediate_packing = is_immediate_packing self.is_immediate_saving = is_immediate_saving self.formats = formats diff --git a/auto_round/context/model.py b/auto_round/context/model.py index 214e9ef27..603b5a7de 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -79,6 +79,12 @@ def __init__( self.tokenizer = tokenizer self.device = device + # MLLM / diffusion artifacts – always present so callers need no getattr guards. + # _load_model() will populate the ones that are relevant to the model type. + self.processor = None + self.image_processor = None + self.pipe = None + if envs.AR_USE_MODELSCOPE: platform = "model_scope" self.platform = platform diff --git a/auto_round/formats.py b/auto_round/formats.py index 3cca93baf..3d3f20326 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -664,7 +664,7 @@ def check_and_reset_format(self, ar): if not awq_supported: logger.warning(f"The AutoAWQ format may not be supported due to {info}") if ar.bits != 4: - raise ValueError("The AWQ format only supports W4 quantization ") + raise ValueError(f"auto_awq format support quantization scheme with W4A16 but got bits={ar.bits}") if self.backend is None: _check_divisible_by_32(ar) From dd5aec7eba8eb8aa5937249b5683ed7ebd0906e2 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 20 Mar 2026 09:49:41 +0800 Subject: [PATCH 13/90] fix Signed-off-by: n1ck-guo --- .../quantization/auto_round/quantizer.py | 3 +- .../architecture_visualization.py | 12 ++++-- auto_round/compressors_new/base.py | 12 +++++- auto_round/compressors_new/calib.py | 2 +- auto_round/compressors_new/diffusion_mixin.py | 1 + auto_round/compressors_new/mllm_mixin.py | 42 +++++++++---------- auto_round/context/compress.py | 2 + test/test_cpu/export/test_export.py | 11 +++-- 8 files changed, 53 insertions(+), 32 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index bbddb0d1d..3f37b87e8 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -553,11 +553,12 @@ def quantize_layer( if q_inputs is not None: q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + static_kv_dtype = self.compress_context.static_kv_dtype if self.config.is_act_quantize and check_need_act_calibration( self.config.act_dynamic, self.config.act_data_type, self.config.act_bits, - self.config.static_kv_dtype, + static_kv_dtype, self.config.static_attention_dtype, ): tmp_inputs = q_inputs if q_inputs is not None else input_ids diff 
--git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py index 6d044c520..50c72ce05 100644 --- a/auto_round/compressors_new/architecture_visualization.py +++ b/auto_round/compressors_new/architecture_visualization.py @@ -27,7 +27,8 @@ def print_architecture_table(): # MLLM combinations (dynamic classes created in entry.py) print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'MLLMCalibCompressor':<40} {'MLLMMixin + CalibCompressor':<35}") print( - f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMCalibratedRTNCompressor':<40} {'MLLMMixin + CalibratedRTNCompressor':<35}" + f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMCalibratedRTNCompressor':<40} " + f"{'MLLMMixin + CalibratedRTNCompressor':<35}" ) print(f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMZeroShotCompressor':<40} {'MLLMMixin + ZeroShotCompressor':<35}") @@ -35,13 +36,16 @@ def print_architecture_table(): # Diffusion combinations (dynamic classes created in entry.py) print( - f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'DiffusionCalibCompressor':<40} {'DiffusionMixin + CalibCompressor':<35}" + f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'DiffusionCalibCompressor':<40} " + f"{'DiffusionMixin + CalibCompressor':<35}" ) print( - f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionCalibratedRTNCompressor':<40} {'DiffusionMixin + CalibratedRTNCompressor':<35}" + f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionCalibratedRTNCompressor':<40} " + f"{'DiffusionMixin + CalibratedRTNCompressor':<35}" ) print( - f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionZeroShotCompressor':<40} {'DiffusionMixin + ZeroShotCompressor':<35}" + f"{'Diffusion':<15} {'RTNConfig':<20} {'DiffusionZeroShotCompressor':<40} " + f"{'DiffusionMixin + ZeroShotCompressor':<35}" ) print("\n" + "=" * 110 + "\n") diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 6517f1a65..1f874e03d 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -80,6 +80,7 @@ class BaseCompressor(object): compress_context: CompressContext = None model_context: ModelContext = None shard_writer: ShardWriter = None + supported_types = SUPPORTED_LAYER_TYPES def __init__( self, @@ -185,6 +186,7 @@ def __init__( is_immediate_packing=self.is_immediate_packing, is_immediate_saving=self.is_immediate_saving, formats=self.formats, + static_kv_dtype=self.config.static_kv_dtype, ) self.model_context = ModelContext( model, @@ -212,6 +214,14 @@ def __init__( self._adjust_torch_compile(enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile + @property + def mllm(self): + return self.model_context.is_mllm + + @property + def diffusion(self): + return self.model_context.is_diffusion + def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" self.enable_torch_compile = enable_torch_compile @@ -304,7 +314,7 @@ def __getattr__(self, name: str) -> Any: if name in self.__dict__: return self.__dict__[name] - for obj in ["quantize_config", "model_context", "compress_context", "quantizer"]: + for obj in ["quantizer", "quantize_config", "model_context", "compress_context"]: if obj not in self.__dict__: continue obj = object.__getattribute__(self, obj) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 9196c3cb1..96d4d290d 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -928,7 +928,7 @@ def _quantize_layers(self, 
layer_names: list, layer_inputs: dict) -> None: enable_quanted_input = self.enable_quanted_input has_gguf = False - if hasattr(self, "formats"): + if hasattr(self, "formats") and self.formats is not None: has_gguf = any(format_.is_gguf() for format_ in self.formats) if has_gguf and self.is_immediate_packing: enable_quanted_input = False diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 1b7e25f38..ab723bab1 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -50,6 +50,7 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps self.generator_seed = generator_seed + self.diffusion = True # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index d8ec59cbf..e2026620e 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -59,8 +59,6 @@ def __init__( self.extra_data_dir = extra_data_dir self.quant_nontext_module = quant_nontext_module self.template_obj = None - # Backward compat: ar.mllm is expected to be True for MLLM instances - self.mllm = True # Pass quant_nontext_module to ModelContext so get_block_names can include vision blocks kwargs.setdefault("quant_nontext_module", quant_nontext_module) @@ -126,25 +124,27 @@ def calib(self, nsamples, bs): if dataset is None: dataset = self.template_obj.default_dataset - ( - self.dataloader, - self.batch_size, - self.seqlen, - self.gradient_accumulate_steps, - ) = get_mllm_dataloader( - template=self.template_obj, - model=mc.model, - tokenizer=tokenizer, - processor=processor, - image_processor=image_processor, - dataset=dataset, - extra_data_dir=self.extra_data_dir, - seqlen=self.quantize_config.seqlen, - bs=bs, - seed=self.seed, - nsamples=nsamples, - quant_nontext_module=self.quant_nontext_module, - ) + if isinstance(self.dataset, str): + dataset = self.dataset.replace(" ", "") + ( + self.dataloader, + self.batch_size, + self.seqlen, + self.gradient_accumulate_steps, + ) = get_mllm_dataloader( + template=self.template_obj, + model=mc.model, + tokenizer=tokenizer, + processor=processor, + image_processor=image_processor, + dataset=dataset, + extra_data_dir=self.extra_data_dir, + seqlen=self.quantize_config.seqlen, + bs=bs, + seed=self.seed, + nsamples=nsamples, + quant_nontext_module=self.quant_nontext_module, + ) # Process data through the model for calibration total_cnt = 0 diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index b891d6735..776083067 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -39,6 +39,7 @@ def __init__( is_immediate_saving: bool = False, formats: Union[list, str] = None, output_dir: str = "./compressed_models", + static_kv_dtype: Optional[torch.dtype] = None, ): super().__init__() self.low_cpu_mem_usage = low_cpu_mem_usage @@ -60,3 +61,4 @@ def __init__( self.is_immediate_packing = is_immediate_packing self.is_immediate_saving = is_immediate_saving self.formats = formats + self.static_kv_dtype = static_kv_dtype diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index a2212fbc6..a073c02fe 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -429,11 +429,12 
@@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar.post_init() with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"): - get_formats("auto_round:auto_awq", ar) + get_formats("auto_round:auto_awq", ar.quantizer) with pytest.raises(ValueError, match="but got bits=2, data_type=int"): - get_formats("auto_round:llm_compressor", ar) + get_formats("auto_round:llm_compressor", ar.quantizer) ar = AutoRound( model=tiny_qwen_model_path, @@ -442,8 +443,9 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar.post_init() with pytest.raises(ValueError, match="but got data_type=fp, bits=4"): - get_formats("auto_round:llm_compressor", ar) + get_formats("auto_round:llm_compressor", ar.quantizer) ar = AutoRound( model=tiny_qwen_model_path, @@ -452,7 +454,8 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=256, sym=True, ) - get_formats("auto_round:auto_awq", ar) + ar.post_init() + get_formats("auto_round:auto_awq", ar.quantizer) def test_autoawq_qwen3_vl_infer(self, dataloader): model_path = get_model_path("Qwen/Qwen3-VL-2B-Instruct") From bde95c68fa69f2de0bf763d60e1e1c1051bd075a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 20 Mar 2026 12:51:26 +0800 Subject: [PATCH 14/90] fix merge Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 5 +++- .../algorithms/quantization/rtn/quantizer.py | 2 ++ auto_round/compressors_new/base.py | 10 +------- auto_round/compressors_new/calib.py | 25 +++++++++++-------- auto_round/compressors_new/shard_writer.py | 4 ++- auto_round/compressors_new/utils.py | 18 +++++++------ auto_round/compressors_new/zero_shot.py | 4 +-- 7 files changed, 38 insertions(+), 30 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 9701857bf..b743d1dbe 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -51,6 +51,7 @@ SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, + compress_layer_names, convert_dtype_str2torch, find_matching_blocks, get_block_names, @@ -245,10 +246,12 @@ def _gen_auto_scheme(self) -> dict[str, dict]: def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): # before get_format, therefore, compress_context.formats is str is_gguf_format = (f := getattr(self.compress_context, "formats", None)) is not None and "gguf" in f + predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) + compressed_predefined_ignore_layers = compress_layer_names(predefined_ignore_layers) if not is_gguf_format: predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) if predefined_ignore_layers: - logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + logger.info(f"Using predefined ignore_layers: {compressed_predefined_ignore_layers}") tmp_str = ",".join(predefined_ignore_layers) if self.ignore_layers == "": self.ignore_layers = tmp_str diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index bd34b11b6..f5af29059 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -166,6 +166,7 @@ def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: enable_round_tuning=False, enable_torch_compile=self.compress_context.enable_torch_compile, 
disable_opt_rtn=disable_opt_rtn, + enable_rtn=True, ) m = m.unwrapper({}) except torch.OutOfMemoryError: @@ -181,6 +182,7 @@ def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: enable_norm_bias_tuning=False, enable_round_tuning=False, enable_torch_compile=self.compress_context.enable_torch_compile, + enable_rtn=True, ) m = m.unwrapper({}) except Exception as e: diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 1f874e03d..d222d8fe3 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -186,7 +186,7 @@ def __init__( is_immediate_packing=self.is_immediate_packing, is_immediate_saving=self.is_immediate_saving, formats=self.formats, - static_kv_dtype=self.config.static_kv_dtype, + static_kv_dtype=self.static_kv_dtype, ) self.model_context = ModelContext( model, @@ -377,14 +377,6 @@ def _adjust_immediate_packing_and_saving(self): if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing: self.is_immediate_saving = True - if self.compress_context.low_cpu_mem_usage and not self.is_immediate_packing: - logger.info( - "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. " - "Setting `low_cpu_mem_usage` to False." - ) - self.compress_context.low_cpu_mem_usage = False - self.is_immediate_saving = False - if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing: if formats[0].is_gguf(): logger.warning( diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 96d4d290d..d806ca241 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -44,6 +44,7 @@ check_seqlen_compatible, check_to_quantized, clear_memory, + compress_layer_names, convert_module_to_hp_if_necessary, get_block_names, get_module, @@ -714,10 +715,10 @@ def _quantize_blocks( if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: if nblocks == 1: - self._offloader.offload(model, n, overwrite=True) + self._offloader(model, n, overwrite=True) else: for name in names: - self._offloader.offload(model, name, overwrite=True) + self._offloader(model, name, overwrite=True) if pbar is not None: pbar.update(1) @@ -790,9 +791,12 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: clear_memory(device_list=self.compress_context.device_list) logger.info("caching done") if self.compress_context.low_cpu_mem_usage: - self._offloader.offload( - self.model_context.model, all_blocks, clear_memory=True, device_list=self.compress_context.device_list - ) + if self.model_context.is_model_patched and not self.compress_context.is_immediate_saving: + self._offloader(self.model_context.model, all_blocks, clear_memory=True, device_list=self.device_list) + if not self._offloader.enabled: + self.compress_context.low_cpu_mem_usage = False + else: + self.compress_context.low_cpu_mem_usage = False if len(all_blocks) > 1: pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) else: @@ -831,6 +835,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: ) pbar.set_description("Quantizing done") pbar.close() + if self.compress_context.low_cpu_mem_usage: + self._offloader.reload(self.model_context.model) self._quantize_layers(layer_names, all_inputs) convert_module_to_hp_if_necessary( @@ -839,9 +845,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if self.is_immediate_saving: self.shard_writer.write(is_finalize=True) - if self.compress_context.low_cpu_mem_usage: - 
self._offloader.reload(self.model_context.model) - end_time = time.time() cost_time = end_time - start_time logger.info(f"quantization tuning time {cost_time}") @@ -861,7 +864,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" ) if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" + compressed_unquantized_layers = compress_layer_names(unquantized_layers) + summary_info += f", {compressed_unquantized_layers} have not been quantized" logger.info(summary_info) self.model_context.quantized = True @@ -915,6 +919,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: enable_torch_compile=self.enable_torch_compile, device=self.compress_context.device, disable_opt_rtn=self.disable_opt_rtn, + enable_rtn=self.iters == 0, ) new_layer = wrapper_layer.unwrapper({}) set_module(self.model, layer_name, new_layer) @@ -1104,7 +1109,7 @@ def _quantize_via_rtn_blockwise(self) -> None: ) if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader.offload(self.model_context.model, block_name) + self._offloader(self.model_context.model, block_name) if block_name == block_names[-1]: clear_memory(input_ids, device_list=self.compress_context.device_list) else: diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index 0cba008e5..772a6cd88 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -213,7 +213,9 @@ def finalize(self): """Saves remaining weights, renames files, and writes the index JSON.""" # 1. Capture remaining weights not yet saved full_sd = self.model.state_dict() - tie_word_embeddings = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) + tie_word_embeddings = False + if hasattr(self.model, "config") and hasattr(self.model.config, "tie_word_embeddings"): + tie_word_embeddings = self.model.config.tie_word_embeddings finalize_skipped_meta_tensors = [] for pname, tensor in full_sd.items(): diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 9169fb757..4e3170ae3 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -26,7 +26,7 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger -from auto_round.utils import check_to_quantized, get_layer_names_in_block, get_module +from auto_round.utils import check_to_quantized, get_layer_names_in_block, get_module, to_standard_regex if TYPE_CHECKING: from auto_round.schemes import QuantizationScheme @@ -268,6 +268,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str extra_scheme_keys = ("scale_dtype",) scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) layer_config = copy.deepcopy(layer_config) or {} + if ignore_layers: + ignore_layers = ignore_layers.replace(" ", "").split(",") + ignore_layers = [name + "." if name[-1].isdigit() else name for name in ignore_layers] # 1. 
ignore_layers -> force 16 for name in get_fp_layer_names(model, ignore_layers): @@ -352,15 +355,18 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str logger.warning(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") continue - regex = re.compile(name) + regex = re.compile(to_standard_regex(name)) matched = [ln for ln in all_supported_layer_names if regex.search(ln)] if not matched: - raise ValueError(f"Invalid '{name}' in layer_config, no match found.") + # type(mlp.gate) is Qwen3VLMoeTextTopKRouter instead of Linear + logger.warning_once( + f"Layer name or regex '{name}' in layer_config does not match any supported layers. " + + "Please check for typos or update the regex pattern, ignore it for now" + ) val = layer_config.pop(name) regex_config[name] = val # keep regex config for match in matched: layer_config[match] = val - # regex_config = None if len(regex_config)==0 else regex_config # 7. lm_head lm_head_name = get_lm_head_name(model) @@ -884,7 +890,7 @@ def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): if not ignore_layers: return [] - ignore_layers = ignore_layers.replace(" ", "").split(",") + all_layer_names = [] for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: @@ -897,8 +903,6 @@ def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): if fp_layer in all_layer_names: not_to_quantized_layers.append(fp_layer) continue - if fp_layer[-1].isdigit(): - fp_layer = fp_layer + "." ##tricky setting for name in all_layer_names: if fp_layer in name: not_to_quantized_layers.append(name) diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 0c6e4446e..1ee314f43 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -143,7 +143,7 @@ def _quantize_via_rtn_blockwise(self) -> None: self.quantizer.quantize_block(block_name, input_ids, input_others) if self.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader.offload(self.model_context.model, block_name) + self._offloader(self.model_context.model, block_name) if block_name == block_names[-1]: clear_memory(input_ids, device_list=self.compress_context.device_list) else: @@ -247,7 +247,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.quantizer.quantize_block(block_name) if self.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader.offload(self.model, block_name) + self._offloader(self.model, block_name) clear_memory(device_list=self.device_list) memory_monitor.log_summary() pbar.update(1) From 7b4e479abfe2a78252594513bdbb4f190c4bddfb Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 20 Mar 2026 15:34:08 +0800 Subject: [PATCH 15/90] fix Signed-off-by: n1ck-guo --- .../quantization/auto_round/quantizer.py | 33 +++++++------ auto_round/algorithms/quantization/config.py | 46 +++++++++++++++++++ .../algorithms/quantization/rtn/config.py | 3 -- .../algorithms/quantization/rtn/quantizer.py | 34 ++++++++------ auto_round/autoround.py | 30 ++++++------ auto_round/compressors/base.py | 32 +++++++------ auto_round/compressors_new/base.py | 1 + auto_round/compressors_new/zero_shot.py | 2 +- auto_round/context/compress.py | 3 ++ 9 files changed, 122 insertions(+), 62 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 3f37b87e8..831619a54 100644 --- 
a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -554,12 +554,13 @@ def quantize_layer( q_inputs[i] = q_inputs[i].to(layer.weight.dtype) static_kv_dtype = self.compress_context.static_kv_dtype + static_attention_dtype = self.compress_context.static_attention_dtype if self.config.is_act_quantize and check_need_act_calibration( self.config.act_dynamic, self.config.act_data_type, self.config.act_bits, static_kv_dtype, - self.config.static_attention_dtype, + static_attention_dtype, ): tmp_inputs = q_inputs if q_inputs is not None else input_ids hook_handles = self._register_act_max_hook(layer) @@ -893,21 +894,23 @@ def _sampling_inputs( for key in input_others.keys(): if "positional_inputs" in key: continue - if (key not in share_cache_keys or len(indices) == 1) and not isinstance( - input_others[key], (str, bool, type(None)) - ): - current_input_others[key] = None - if input_others[key] is not None: - current_input_others[key] = [input_others[key][i] for i in indices] - if len(indices) == 1: - current_input_others[key] = current_input_others[key][0] - else: - try: - current_input_others[key] = torch.cat(current_input_others[key], dim=0) - except TypeError as err: - logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") - else: + # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored + # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such + # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). + # Always pass them through unchanged. + if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): current_input_others[key] = input_others[key] + elif input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = None return current_input_ids, current_input_others diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index dd27be99c..0c5cbca79 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -65,6 +65,35 @@ def __post_init__(self): # Resolve scheme attributes early so properties (is_act_nv_fp, is_wfp8afp8, etc.) # work correctly at construction time without waiting for post_init(). self._early_resolve_scheme() + # Run block-wise validation early (at construction time, before model loading). + # Guard with None checks because _early_resolve_scheme may leave attributes unresolved + # (e.g. when scheme is an AutoScheme that needs model info). + if self.group_size is not None and isinstance(self.group_size, (tuple, list)): + if not ( + self.data_type is not None + and self.bits is not None + and self.data_type.startswith("fp") + and self.bits == 8 + ): + raise ValueError( + "Block-wise quantization (tuple group_size) only supports fp8 weight quantization, " + f"but got data_type='{self.data_type}', bits={self.bits}." 
+                )
+        if (
+            self.act_dynamic is not None
+            and self.act_data_type is not None
+            and self.act_bits is not None
+            and not (self.act_dynamic and self.act_data_type.startswith("fp") and self.act_bits == 8)
+        ):
+            raise NotImplementedError(
+                "Block-wise fp8 weight quantization only supports dynamic fp8 activation quantization. "
+                f"Got act_dynamic={self.act_dynamic}, act_data_type='{self.act_data_type}', "
+                f"act_bits={self.act_bits}."
+            )
+        if self.act_group_size is not None and isinstance(self.act_group_size, (tuple, list)):
+            raise ValueError(
+                "`act_group_size` must be -1 (per channel), 0 (per-tensor), or a positive integer, not a tuple."
+            )
 
     def _early_resolve_scheme(self) -> None:
         """Resolve scheme attributes early so properties work from init time.
@@ -123,10 +152,27 @@ def check_config(self) -> None:
                 "`group_size` must be -1 (per channel), 0 (per-tensor), a positive integer, "
                 "or a tuple thereof (e.g. (128, 128) for block-wise quantization)"
             )
+        if isinstance(self.act_group_size, (tuple, list)):
+            raise ValueError(
+                "`act_group_size` must be -1 (per channel), 0 (per-tensor), or a positive integer, not a tuple."
+            )
         if not self._is_valid_group_size(self.act_group_size):
             raise ValueError(
                 "`act_group_size` must be -1 (per channel), 0 (per-tensor), a positive integer, "
                 "or a tuple thereof"
             )
+        # Block-wise (tuple group_size) is only valid for fp8 weight quantization
+        if isinstance(self.group_size, (tuple, list)):
+            if not (self.data_type.startswith("fp") and self.bits == 8):
+                raise ValueError(
+                    "Block-wise quantization (tuple group_size) only supports fp8 weight quantization, "
+                    f"but got data_type='{self.data_type}', bits={self.bits}."
+                )
+            if not (self.act_dynamic and self.act_data_type.startswith("fp") and self.act_bits == 8):
+                raise NotImplementedError(
+                    "Block-wise fp8 weight quantization only supports dynamic fp8 activation quantization. "
+                    f"Got act_dynamic={self.act_dynamic}, act_data_type='{self.act_data_type}', "
+                    f"act_bits={self.act_bits}."
+                )
         # Reset the default value of super_bits and super_group_size
         if self.data_type.endswith("_dq"):
             gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"]
diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py
index cb297529c..d31471f7e 100644
--- a/auto_round/algorithms/quantization/rtn/config.py
+++ b/auto_round/algorithms/quantization/rtn/config.py
@@ -67,6 +67,3 @@ def __init__(
         self.disable_opt_rtn = disable_opt_rtn
         if not self.disable_opt_rtn:
             self._alg_cls = "OptimizedRTNQuantizer"
-
-        if not self.disable_opt_rtn and f"rtn_{self.data_type}" in QUANT_FUNC_WITH_DTYPE:
-            self.data_type = f"rtn_{self.data_type}"
diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py
index f5af29059..7467a4faf 100644
--- a/auto_round/algorithms/quantization/rtn/quantizer.py
+++ b/auto_round/algorithms/quantization/rtn/quantizer.py
@@ -81,7 +81,7 @@ def quantize_block(self, block_name: str, **kwargs):
         tied_weights_values = list(tied_weights_keys)
         tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values]  # rm weight/bias
         # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it
-        if hasattr(self.compress_context, "formats") and self.compress_context.formats[0].is_gguf():
+        if getattr(self.compress_context, "formats", None) and self.compress_context.formats[0].is_gguf():
             lm_head_name = get_lm_head_name(self.model)
             if lm_head_name is not None:
                 tied_weights_layers.append(lm_head_name)
@@ -187,6 +187,8 @@ def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None:
                 m = m.unwrapper({})
             except Exception as e:
                 raise
+
+        set_module(self.model, name, m)
         self._immediate_pack_and_save_module(name)
 
     def _immediate_pack_and_save_module(self, module_name):
@@ -378,20 +380,22 @@ def _sampling_inputs(
         for key in input_others.keys():
             if "positional_inputs" in key:
                 continue
-            if (key not in share_cache_keys or len(indices) == 1) and not isinstance(
-                input_others[key], (str, bool, type(None))
-            ):
-                current_input_others[key] = None
-                if input_others[key] is not None:
-                    current_input_others[key] = [input_others[key][i] for i in indices]
-                    if len(indices) == 1:
-                        current_input_others[key] = current_input_others[key][0]
-                    else:
-                        try:
-                            current_input_others[key] = torch.cat(current_input_others[key], dim=0)
-                        except TypeError as err:
-                            logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.")
-            else:
+            # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored
+            # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such
+            # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos).
+            # Always pass them through unchanged.
+ if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): current_input_others[key] = input_others[key] + elif input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = None return current_input_ids, current_input_others diff --git a/auto_round/autoround.py b/auto_round/autoround.py index ed90bac09..f5365d80b 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -232,21 +232,23 @@ def _sampling_inputs( for key in input_others.keys(): if "positional_inputs" in key: continue - if (key not in share_cache_keys or len(indices) == 1) and not isinstance( - input_others[key], (str, bool, type(None)) - ): - current_input_others[key] = None - if input_others[key] is not None: - current_input_others[key] = [input_others[key][i] for i in indices] - if len(indices) == 1: - current_input_others[key] = current_input_others[key][0] - else: - try: - current_input_others[key] = torch.cat(current_input_others[key], dim=0) - except TypeError as err: - logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") - else: + # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored + # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such + # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). + # Always pass them through unchanged. + if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): current_input_others[key] = input_others[key] + elif input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = None return current_input_ids, current_input_others diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 864cc069d..eb5226734 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2059,6 +2059,7 @@ def input_capture_hook(module, *args, **kwargs): first_block_name = self.quant_block_list[0][0] class _FakeDecodingLayer(torch.nn.Module): + def forward(self, *args, **kwargs): return args, kwargs @@ -2603,6 +2604,7 @@ def _replace_forward(self): self.hook_handles.append(hook_handle) def _register_act_max_hook(self, model): + def get_act_max_hook(module, input, output): if isinstance(input, (tuple, list)): input = input[0] @@ -3514,21 +3516,23 @@ def _sampling_inputs( for key in input_others.keys(): if "positional_inputs" in key: continue - if (key not in share_cache_keys or len(indices) == 1) and not isinstance( - input_others[key], (str, bool, type(None)) - ): - current_input_others[key] = None - if input_others[key] is not None: - current_input_others[key] = [input_others[key][i] for i in indices] - if len(indices) == 1: - current_input_others[key] = current_input_others[key][0] - else: - try: - current_input_others[key] = 
torch.cat(current_input_others[key], dim=0) - except TypeError as err: - logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") - else: + # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored + # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such + # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). + # Always pass them through unchanged. + if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): current_input_others[key] = input_others[key] + elif input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = None return current_input_ids, current_input_others diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index d222d8fe3..d17e8cdbb 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -187,6 +187,7 @@ def __init__( is_immediate_saving=self.is_immediate_saving, formats=self.formats, static_kv_dtype=self.static_kv_dtype, + static_attention_dtype=self.static_attention_dtype, ) self.model_context = ModelContext( model, diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 1ee314f43..915c5da08 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -232,7 +232,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: tied_weights_values = list(tied_weights_keys) tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it - if hasattr(self, "formats") and self.formats[0].is_gguf(): + if getattr(self, "formats", None) and self.formats[0].is_gguf(): lm_head_name = get_lm_head_name(self.model) if lm_head_name is not None: tied_weights_layers.append(lm_head_name) diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index 776083067..979b6f67d 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -40,6 +40,8 @@ def __init__( formats: Union[list, str] = None, output_dir: str = "./compressed_models", static_kv_dtype: Optional[torch.dtype] = None, + static_attention_dtype: Optional[torch.dtype] = None, + **kwargs, ): super().__init__() self.low_cpu_mem_usage = low_cpu_mem_usage @@ -62,3 +64,4 @@ def __init__( self.is_immediate_saving = is_immediate_saving self.formats = formats self.static_kv_dtype = static_kv_dtype + self.static_attention_dtype = static_attention_dtype From 9b4cab717246c94f9330645b75d8e27517977d79 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 23 Mar 2026 13:11:55 +0800 Subject: [PATCH 16/90] update Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 64 ++++++++++------ auto_round/compressors_new/base.py | 88 ++++++++++++++++------ auto_round/compressors_new/entry.py | 8 +- auto_round/schemes.py | 12 +++ 4 files changed, 126 insertions(+), 46 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 
b743d1dbe..0b4887e5a 100644
--- a/auto_round/algorithms/quantization/base.py
+++ b/auto_round/algorithms/quantization/base.py
@@ -23,7 +23,6 @@
 from auto_round.compressors_new.utils import (
     IndexSampler,
     _get_quantized_layer_names_outside_blocks,
-    _get_save_folder_name,
     block_forward,
     check_need_act_calibration,
     check_skippable_keywords,
@@ -90,6 +89,8 @@ def __init__(self, config: QuantizationConfig):
         self.ignore_layers = config.ignore_layers
         self.quant_lm_head = config.quant_lm_head
         self.to_quant_block_names = config.to_quant_block_names
+        # Instance-level flag: avoids class-level sharing that could cause cross-instance bugs.
+        self._scheme_resolved = False
 
     @classmethod
     def from_config(cls, config: QuantizationConfig):
@@ -104,29 +105,39 @@ def formats(self):
         return getattr(self.compress_context, "formats", None)
 
-    def post_init(self):
-        # should be set after loading model and set layer_config, cause some special scheme need these.
-        # Preserve the original, unparsed scheme for later use in auto scheme generation
-        # within `configure_layer_config` (which may need the raw value instead of `self.scheme`).
+    def resolve_scheme(
+        self,
+        model_context: "ModelContext",
+        compress_context: "CompressContext",
+        dataset: str = None,
+    ) -> None:
+        """Phase-1 init: resolve scheme and bind config attrs (no model structure needed).
 
-        # # Alternatively, you can use ModelContext.get_context
-        self.model_context = ModelContext()
-        self.compress_context = CompressContext()
+        Must be called BEFORE get_formats() and BEFORE post_init().
+        Idempotent: safe to call multiple times.
 
-        # used in shard writer, rafactor later
-        self._get_save_folder_name = _get_save_folder_name
+        Args:
+            model_context: The ModelContext created by BaseCompressor.
+            compress_context: The CompressContext created by BaseCompressor.
+            dataset: Calibration dataset name/path. Used by AutoScheme's delta-loss
+                scheme selection. Callers should pass the compressor's own dataset
+                (or a sensible default) rather than leaving it to a global lookup.
+        """
+        if self._scheme_resolved:
+            return
 
-        self.model = self.model_context.model
+        self.model_context = model_context
+        self.compress_context = compress_context
+        self.model = model_context.model
+        if dataset is not None:
+            self.dataset = dataset
+
+        # Build user-specified overrides from fields defined in QuantizationScheme.
         scheme_fields = {f.name for f in fields(QuantizationScheme)}
-        user_scheme_overrides = {}
-        for k in scheme_fields:
-            v = getattr(self.config, k, None)
-            if v is not None:
-                user_scheme_overrides[k] = v
+        user_scheme_overrides = {k: v for k in scheme_fields if (v := getattr(self.config, k, None)) is not None}
 
         default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides)
-        # Bind attributes to self.config for easy instance-level access
+        # Bind resolved attrs to config and self for convenient access.
         for key, value in final_attrs.items():
             setattr(self.config, key, value)
             if hasattr(self, key):
@@ -136,16 +147,25 @@
         self.orig_scheme = copy.deepcopy(self.scheme)
         self.scheme = default_scheme
 
+        # GGUF format uses fp32 scale dtype; everything else defaults to fp16.
         gguf_scheme_name = get_gguf_scheme(self.scheme)
-        # GGUF uses fp32 scale dtype as default
         if self.scale_dtype is None:
             self.scale_dtype = "fp32" if gguf_scheme_name else "fp16"
         self.scale_dtype = convert_dtype_str2torch(self.scale_dtype)
 
-        if not self.is_auto_scheme:
-            enable_gguf_official_mixed = True
-        else:
-            enable_gguf_official_mixed = False
+        self._scheme_resolved = True
+
+    def post_init(self) -> None:
+        """Phase-2 init: build layer config on the patched model.
+
+        Requires resolve_scheme() to have been called first (asserted below).
+        Must be called AFTER model_context.apply_patches().
+        """
+        assert self._scheme_resolved, (
+            "resolve_scheme() must be called before post_init(). " "BaseCompressor.post_init() does this automatically."
+        )
+
+        enable_gguf_official_mixed = not self.is_auto_scheme
 
         if self.quant_block_list is None:
             quant_nontext_module = getattr(self.model_context, "quant_nontext_module", False)
diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py
index d17e8cdbb..da33842b6 100644
--- a/auto_round/compressors_new/base.py
+++ b/auto_round/compressors_new/base.py
@@ -30,7 +30,6 @@
     SUPPORTED_LAYER_TYPES,
     TORCH_VERSION_AT_LEAST_2_6,
     compile_func,
-    convert_dtype_str2torch,
     extract_block_names_to_str,
     is_debug_mode,
     is_hpex_available,
@@ -204,11 +203,13 @@ def __init__(
         )
         self.shard_writer = None
 
-        from auto_round.schemes import get_gguf_scheme
+        # scale_dtype is resolved in quantizer.resolve_scheme() after scheme resolution,
+        # so it is not initialized here to avoid premature evaluation with an unresolved scheme.
 
-        qc = self.quantize_config
-        if qc.scale_dtype is None:
-            qc.scale_dtype = convert_dtype_str2torch("fp32" if get_gguf_scheme(qc.scheme) else "fp16")
+        # Flag for post_init idempotency. Set to False here so post_init() can be called
+        # either via quantize_and_save() (preferred, outside inference_mode) or directly
+        # from quantize() as a fallback for non-AutoScheme cases.
+        self._post_init_done = False
 
         # Apply torch compile adjustments eagerly so that ar.enable_torch_compile
         # reflects the correct value immediately after construction (not only after post_init).
@@ -266,50 +267,88 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None:
             self.enable_torch_compile = False
             logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled")
 
-    def post_init(self):
-        assert self.model_context._model_loaded, "model should be loaded in ModelContext.__init__"
+    def _get_calibration_dataset(self) -> str:
+        """Resolve calibration dataset: self.dataset > AutoScheme.dataset > default."""
+        dataset = self.__dict__.get("dataset", None)
+        if dataset:
+            return dataset
+        from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
+
+        scheme = self.quantize_config.scheme
+        if isinstance(scheme, AutoScheme) and scheme.dataset:
+            return scheme.dataset
+        return "NeelNanda/pile-10k"
+
+    def post_init(self) -> None:
+        """One-time initialization that requires a loaded model.
+
+        Call this OUTSIDE any ``@torch.inference_mode()`` context when using
+        AutoScheme – delta-loss selection needs autograd (backward pass).
+        ``quantize_and_save()`` does this automatically before entering the
+        inference-mode quantize loop.
+
+        The five phases in order:
+        1. Scheme resolution – pure config, no model structure needed.
+        2. Format resolution – needs data_type/bits from phase 1.
+        3. Model patching – needs formats from phase 2.
+        4. Layer-config build – needs patched model from phase 3.
+        5. Hardware setup – device map, torch.compile, offloading.
+ """ + if self._post_init_done: + return + + # ── Phase 1: resolve scheme ─────────────────────────────────────────── + # Creates the quantizer and runs scheme parsing (pure config work: + # sets data_type / bits / sym / scale_dtype etc.). + self.quantizer = BaseQuantizers.from_config(self.quantize_config) + self.quantizer.resolve_scheme( + model_context=self.model_context, + compress_context=self.compress_context, + dataset=self._get_calibration_dataset(), + ) + self.wrapper_block = wrapper_block - # 1. Resolve formats (scale_dtype was defaulted early in __init__) + # ── Phase 2: resolve output format ─────────────────────────────────── + # get_formats() inspects data_type / bits etc. that were just resolved. if isinstance(self.formats, str): self.formats = get_formats(self.formats, self) if self.formats is not None: self.compress_context.formats = self.formats - ShardWriter.reset() # Ensure a fresh ShardWriter for every new quantization run + ShardWriter.reset() self.shard_writer = ShardWriter(self.model_context.model, bits=8) + # ── Phase 3: patch model structure ─────────────────────────────────── + # update_module() may replace layers (e.g. MoE expert merging); must + # happen before configure_layer_config() so it sees the final topology. self.model_context.apply_patches(self.formats) - self.quantizer = BaseQuantizers.from_config(self.quantize_config) + # ── Phase 4: build layer config ────────────────────────────────────── + # configure_layer_config() walks the patched model; _gen_auto_scheme() + # (AutoScheme path) runs delta-loss forward+backward passes. self.quantizer.post_init() - self.wrapper_block = wrapper_block - # Set device + # ── Phase 5: hardware / compile setup ──────────────────────────────── set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) - - # Re-check torch compile with fully resolved config attrs + # Re-evaluate torch.compile eligibility now that data_type is resolved. self._adjust_torch_compile(self.enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile - self.block_forward = ( compile_func(block_forward, self.compress_context.device) if self.enable_torch_compile else block_forward ) - if self.compress_context.low_cpu_mem_usage: self._offloader.reset() - def _should_disable_inplace_due_to_layers_outside_block() -> bool: - return self.quantizer.has_qlayer_outside_block and self.need_calib - - # Disable inplace mode when there are quantized layers outside blocks - # under specific iteration/optimization settings. - if _should_disable_inplace_due_to_layers_outside_block(): + # Disable inplace when quantized layers live outside transformer blocks. + if self.quantizer.has_qlayer_outside_block and self.need_calib: self.inplace = False + if not hasattr(self, "formats"): logger.warning("this API is deprecated, please use `quantize_and_save` instead") else: - # Determine if immediate packing is required self._adjust_immediate_packing_and_saving() + self._post_init_done = True + # backward compatible with the legacy API def __getattr__(self, name: str) -> Any: if name in self.__dict__: @@ -539,6 +578,9 @@ def quantize_and_save( kwargs.pop("inplace", None) # Perform model quantization + # IMPORTANT: post_init() must run outside any @torch.inference_mode() context + # because AutoScheme's delta-loss selection requires gradient tracking. 
+ self.post_init() if self.static_attention_dtype is not None: from auto_round.experimental.attention import attention_quant_ctx diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 27a2c6b92..3dcf640bc 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -121,7 +121,13 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): static_attention_dtype=kwargs.get("static_attention_dtype"), ) - if enable_imatrix or needs_act_calib: + # AutoScheme always requires calibration data for delta-loss based + # scheme selection, regardless of whether imatrix is needed. + from auto_round.auto_scheme.gen_auto_scheme import AutoScheme as _AutoScheme + + is_auto_scheme = isinstance(config.scheme, _AutoScheme) + + if enable_imatrix or needs_act_calib or is_auto_scheme: config._alg_cls = "OptimizedRTNQuantizer" # For RTN with calibration data, dynamically combine with model-specific Mixin if model_type == "mllm": diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 953d10921..91cc4ea56 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -534,6 +534,18 @@ def get_gguf_scheme(scheme: Union[str, QuantizationScheme]) -> str: return scheme if isinstance(scheme, str): return "" + # AutoScheme is a lazy placeholder whose concrete option is resolved later + # (after model loading). It is never a GGUF scheme itself. + from auto_round.auto_scheme.gen_auto_scheme import AutoScheme + + if isinstance(scheme, AutoScheme): + # options is always a list after AutoScheme.__post_init__. + # If the primary option is a GGUF scheme, propagate it so that + # scale_dtype defaults to fp32 (GGUF convention). + primary = scheme.options[0] if scheme.options else None + if isinstance(primary, str) and primary.upper().startswith("GGUF"): + return primary + return "" for key, val in PRESET_SCHEMES.items(): # For q40 or q4_1 we only support it with str scheme, otherwise it will be matched incorrectly with W4G32 if not key.upper().startswith("GGUF") or ("0" in key or "1" in key): From a1fe717cc88a35d68ca9f330a398c55902f87c14 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 23 Mar 2026 13:44:36 +0800 Subject: [PATCH 17/90] sync merge change Signed-off-by: n1ck-guo --- .../quantization/auto_round/quantizer.py | 5 ++-- auto_round/compressors_new/base.py | 8 +++++ auto_round/compressors_new/calib.py | 3 ++ auto_round/compressors_new/mllm_mixin.py | 30 +++++++++++++++++++ auto_round/compressors_new/shard_writer.py | 6 +++- 5 files changed, 49 insertions(+), 3 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 831619a54..617b9ab9c 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -43,6 +43,7 @@ is_auto_device_mapping, is_hpex_available, memory_monitor, + merge_block_output_keys, mv_module_from_gpu, set_amax_for_all_moe_layers, to_device, @@ -91,7 +92,7 @@ def post_init(self): super().post_init() if self.enable_alg_ext: try: - logger.warning_once("using algorithm extension for quantization.") + logger.info("using algorithm extension for quantization.") from auto_round.alg_ext import wrapper_autoround wrapper_autoround(self) @@ -129,7 +130,7 @@ def _get_diffusion_current_q_output( ) if isinstance(current_input_ids, dict): hidden_states = current_input_ids.pop("hidden_states") - current_input_others.update(current_input_ids) + 
merge_block_output_keys(block, current_input_others, current_input_ids) current_input_ids = hidden_states output_q = block_forward( block, diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index da33842b6..e7e5ee077 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -72,6 +72,7 @@ class SerializedCompressorConfig: super_bits: Optional[int] = None super_group_size: Optional[int] = None to_quant_block_names: Optional[list[str]] = None + transform_config: Optional[dict[str, Any]] = None class BaseCompressor(object): @@ -172,6 +173,8 @@ def __init__( logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 + self.transform_config = kwargs.pop("transform_config", {}) + # Reset both context singletons before creating fresh instances so that # consecutive AutoRound creations don't inherit stale config from earlier ones. CompressContext.reset_context() @@ -266,6 +269,11 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: if self.enable_torch_compile and is_raw_nv_fp: self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as nvfp4 is enabled") + super_group_size = getattr(cfg, "super_group_size", None) + enable_alg_ext = getattr(cfg, "enable_alg_ext", False) + if self.enable_torch_compile and super_group_size is not None and enable_alg_ext: + self.enable_torch_compile = False + logger.warning("reset enable_torch_compile to `False` as super_group_size is set for algorithm extension") def _get_calibration_dataset(self) -> str: """Resolve calibration dataset: self.dataset > AutoScheme.dataset > default.""" diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index d806ca241..09a20c5b0 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -54,6 +54,7 @@ set_module, to_device, to_dtype, + wrap_block_forward_positional_to_kwargs, ) from auto_round.utils.device import ( parse_available_devices, @@ -460,6 +461,8 @@ def _get_block_forward_func(self, name: str) -> Callable: Returns: function: The forward function. """ + if self.model_context.is_diffusion: + return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) def post_process_cache_data(batch_size, data, data_name): """ diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index e2026620e..50cda6492 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -179,3 +179,33 @@ def calib(self, nsamples, bs): exit(-1) elif total_cnt < nsamples: logger.warning(f"Insufficient number of samples: required {nsamples}, but only {total_cnt} were processed.") + + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): + """Save the quantized model to the specified output directory in the specified format. + + Args: + output_dir (str, optional): The directory to save the quantized model. Defaults to None. + format (str, optional): The format in which to save the model. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place. Defaults to True. + **kwargs: Additional keyword arguments specific to the export format. + + Returns: + object: The compressed model object. 
+ """ + mc = self.model_context + processor = mc.processor + image_processor = mc.image_processor + tokenizer = mc.tokenizer + + if processor is not None and not hasattr(processor, "chat_template"): + processor.chat_template = None + compressed_model = super().save_quantized( + output_dir=output_dir, + format=format, + inplace=inplace, + processor=processor, + image_processor=image_processor, + quant_nontext_module=self.quant_nontext_module if hasattr(self, "quant_nontext_module") else False, + **kwargs, + ) + return compressed_model diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index 772a6cd88..4ea5ce0ca 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -91,7 +91,11 @@ def __init__( # Directory Setup compress_context = CompressContext.get_context() formats = compress_context.formats - self.output_dir = os.path.join(_get_save_folder_name(formats[0]), "") + base_dir = _get_save_folder_name(formats[0]) + subfolder = getattr(self.model, "_autoround_pipeline_subfolder", None) + if subfolder: + base_dir = os.path.join(base_dir, subfolder) + self.output_dir = os.path.join(base_dir, "") os.makedirs(self.output_dir, exist_ok=True) ShardWriter._initialized = True From b58d55aa82e8e002d241f20e1e4ddd06435ce612 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 23 Mar 2026 15:00:29 +0800 Subject: [PATCH 18/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 3 --- auto_round/compressors_new/diffusion_mixin.py | 9 +++++++++ auto_round/compressors_new/entry.py | 6 ++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 09a20c5b0..4cdc6625d 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -461,8 +461,6 @@ def _get_block_forward_func(self, name: str) -> Callable: Returns: function: The forward function. """ - if self.model_context.is_diffusion: - return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) def post_process_cache_data(batch_size, data, data_name): """ @@ -922,7 +920,6 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: enable_torch_compile=self.enable_torch_compile, device=self.compress_context.device, disable_opt_rtn=self.disable_opt_rtn, - enable_rtn=self.iters == 0, ) new_layer = wrapper_layer.unwrapper({}) set_module(self.model, layer_name, new_layer) diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index ab723bab1..e37702cb9 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -18,6 +18,7 @@ from tqdm import tqdm from auto_round.logger import logger +from auto_round.utils.model import wrap_block_forward_positional_to_kwargs class DiffusionMixin: @@ -55,6 +56,14 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) + def _get_block_forward_func(self, name: str): + """Diffusion models pass positional args; wrap the base forward func accordingly. + + The MRO guarantees that super() resolves to CalibCompressor._get_block_forward_func, + mirroring the old-arch pattern in compressors/diffusion/compressor.py. 
+ """ + return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) + @torch.no_grad() def calib(self, nsamples, bs): """Perform diffusion-specific calibration for quantization. diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 3dcf640bc..0f6bf7cc6 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -71,6 +71,12 @@ def __new__( # Detect model type to determine if we need special compressor model_type = detect_model_type(model) + # If the user explicitly passes processor/image_processor, treat as MLLM even if + # auto-detection missed it (mirrors the has_multimodal_assets check in autoround.py). + has_multimodal_assets = kwargs.get("processor") is not None or kwargs.get("image_processor") is not None + if has_multimodal_assets and model_type != "mllm": + model_type = "mllm" + if isinstance(config, AutoRoundConfig): # For AutoRound, we need calibration-based compression # Dynamically create combined class using Mixin pattern From 6a7ac607c7342e3724e705889a5173f3b28fac48 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 27 Mar 2026 14:44:40 +0800 Subject: [PATCH 19/90] fix ut Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 26 ++++++ auto_round/compressors/base.py | 6 +- auto_round/compressors_new/base.py | 82 ++++++++++++++++++- auto_round/compressors_new/calib.py | 15 ++-- auto_round/compressors_new/diffusion_mixin.py | 1 - auto_round/compressors_new/utils.py | 5 +- .../export/export_to_autogptq/export.py | 2 +- auto_round/export/export_to_gguf/export.py | 47 +++++------ auto_round/formats.py | 5 +- auto_round/modeling/fused_moe/gpt_oss.py | 21 +++-- auto_round/modeling/unfused_moe/__init__.py | 5 +- auto_round/utils/model.py | 5 +- 12 files changed, 171 insertions(+), 49 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 0b4887e5a..1218a6111 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -105,6 +105,16 @@ def from_config(cls, config: QuantizationConfig): def formats(self): return getattr(self.compress_context, "formats", None) + @property + def amp(self): + return getattr(self.model_context, "amp", False) + + @property + def amp_dtype(self): + import torch + + return getattr(self.model_context, "amp_dtype", torch.float32) + def resolve_scheme( self, model_context: "ModelContext", @@ -290,6 +300,21 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) quant_lm_head=self.quant_lm_head, mllm=self.model_context.is_mllm, ) + # For GGUF _mixed formats (e.g. gguf:q2_k_mixed), the inner GGUFFormat + # stores the mixed-handling result in ar.layer_config which is NOT the + # same object as quantizer.layer_config. Re-apply the special scheme + # on the patched model here (after Phase 3) so layer names are correct. 
+ _gguf_orig_fmt = getattr(self, "_gguf_original_format_name", None) + if _gguf_orig_fmt and "_MIXED" in _gguf_orig_fmt.upper(): + self.layer_config = _handle_special_schemes( + _gguf_orig_fmt.lower(), + self.layer_config, + self.model_context.model, + supported_types=SUPPORTED_LAYER_TYPES, + inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, + quant_lm_head=self.quant_lm_head, + mllm=self.model_context.is_mllm, + ) fill_default_value = True if self.is_auto_scheme: @@ -307,6 +332,7 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) enable_gguf_official_mixed=enable_gguf_official_mixed, is_mllm=self.model_context.is_mllm, fill_default_value=fill_default_value, + gguf_format_name=getattr(self, "_gguf_format_name", None), ) def _register_act_max_hook(self, model): diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 209b85259..c6fe4e4bd 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2473,10 +2473,10 @@ def post_process_cache_data(batch_size, data, data_name): Processed data or None """ new_data = data - if batch_size <= 1: - return new_data if data_name in self.shared_cache_keys: return None + if batch_size <= 1: + return new_data if "alibi" in data_name: if isinstance(data, torch.Tensor): alibi = data @@ -2524,7 +2524,7 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): ): if key not in self.inputs[name].keys(): # initialization data = to_device(kwargs[key], device=torch.device("cpu")) - if data is None or (self.batch_size > 1 and key in self.shared_cache_keys): + if data is None or key in self.shared_cache_keys: self.inputs[name][key] = data continue if self.batch_size <= 1: diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index e7e5ee077..b145b7211 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, fields from typing import Any, Optional, Union import torch @@ -27,6 +27,7 @@ from auto_round.formats import OutputFormat, get_formats from auto_round.logger import logger from auto_round.utils import ( + INNER_SUPPORTED_LAYER_TYPES, SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, compile_func, @@ -81,6 +82,8 @@ class BaseCompressor(object): model_context: ModelContext = None shard_writer: ShardWriter = None supported_types = SUPPORTED_LAYER_TYPES + inner_supported_types = INNER_SUPPORTED_LAYER_TYPES + quant_block_list = None def __init__( self, @@ -325,6 +328,83 @@ def post_init(self) -> None: ShardWriter.reset() self.shard_writer = ShardWriter(self.model_context.model, bits=8) + # ── Phase 2b: propagate format-adjusted attrs back to quantizer ────── + # gguf_args_check (called inside get_formats) may have overridden + # bits / sym / data_type / super_bits / super_group_size / group_size + # on *this* BaseCompressor object. The quantizer stored its own copies + # from Phase 1 (resolve_scheme), so we must sync them now, before + # quantizer.post_init() builds the layer_config in Phase 4. 
+ _gguf_forwarded_attrs = ( + "bits", + "sym", + "data_type", + "super_bits", + "super_group_size", + "group_size", + "act_bits", + "scale_dtype", + ) + _any_gguf_attr_changed = False + for _attr in _gguf_forwarded_attrs: + if _attr in self.__dict__ and hasattr(self.quantizer, _attr): + if _attr not in ("scale_dtype", "act_bits") and getattr(self.quantizer, _attr) != self.__dict__[_attr]: + _any_gguf_attr_changed = True + setattr(self.quantizer, _attr, self.__dict__[_attr]) + # If gguf_args_check changed scheme attrs, rebuild self.quantizer.scheme + # so that set_layer_config() uses the correct default_dict and gguf_name. + if _any_gguf_attr_changed: + from auto_round.schemes import PRESET_SCHEMES + from auto_round.schemes import QuantizationScheme as _QS + + # Prefer to derive the scheme directly from the gguf format name to + # avoid ambiguity (e.g. Q4_K_S and Q4_K_M share identical weight attrs). + _gguf_preset_scheme = None + _gguf_fmt_name = None + _gguf_original_fmt_name = None + for _fmt in self.formats or []: + # GGUFFormat (outer) has output_format="gguf" but backend.output_format="gguf:q4_k_m" + # GGUFFormat (inner/standalone) has output_format="gguf:q4_k_m" + _of = getattr(_fmt, "output_format", "") + if "gguf" in str(_of): + if str(_of) == "gguf": + # outer GGUFFormat: full format in _original_format (e.g. "gguf:q2_k_mixed") + # or backend.output_format (e.g. "gguf:q2_k_s" after _mixed → _s conversion) + _orig = getattr(_fmt, "_original_format", None) + if _orig: + _gguf_original_fmt_name = str(_orig).upper() + _backend = getattr(_fmt, "backend", None) + _of = getattr(_backend, "output_format", _of) if _backend is not None else _of + _preset_key = str(_of).upper() + if _preset_key in PRESET_SCHEMES: + _gguf_preset_scheme = PRESET_SCHEMES[_preset_key] + _gguf_fmt_name = _preset_key + break + if _gguf_preset_scheme is not None: + self.quantizer.scheme = _gguf_preset_scheme + # Store the exact gguf format name so set_layer_config can + # use it directly, avoiding Q4_K_S / Q4_K_M ambiguity. + self.quantizer._gguf_format_name = _gguf_fmt_name + # Store original format name (may include _mixed) for _handle_special_schemes + if _gguf_original_fmt_name: + self.quantizer._gguf_original_format_name = _gguf_original_fmt_name + else: + _new_scheme_dict = {f.name: getattr(self.quantizer, f.name, None) for f in fields(_QS)} + self.quantizer.scheme = _QS.from_dict({k: v for k, v in _new_scheme_dict.items() if v is not None}) + + # ── Phase 2c: sync layer_config set by GGUFFormat._mixed handling ─── + # Inner GGUFFormat("q2_k_mixed", ar) calls _handle_special_schemes and + # stores the result in ar.__dict__["layer_config"] (via ar.layer_config=). + # This is NOT the same object as quantizer.layer_config, so we must + # forward it here before quantizer.post_init() builds the final config. + _compressor_layer_cfg = self.__dict__.get("layer_config") + if _compressor_layer_cfg is not None and isinstance(_compressor_layer_cfg, dict) and _compressor_layer_cfg: + # Merge: let the GGUFFormat-set entries take precedence over any + # user-provided entries already in quantizer.layer_config. + if self.quantizer.layer_config is None: + self.quantizer.layer_config = {} + for _lname, _lval in _compressor_layer_cfg.items(): + self.quantizer.layer_config.setdefault(_lname, _lval) + # ── Phase 3: patch model structure ─────────────────────────────────── # update_module() may replace layers (e.g. MoE expert merging); must # happen before configure_layer_config() so it sees the final topology. 
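The post_init() refactor above depends on two properties spelled out in its docstring: the five phases must run in this exact order, and the method must be idempotent so it can be reached either from quantize_and_save() (preferred, outside inference mode) or directly from quantize(). A minimal sketch of that shape, assuming a hypothetical SketchCompressor whose no-op helpers stand in for the real scheme/format/patch/layer-config/hardware steps (none of these names are part of the actual auto_round API):

    class SketchCompressor:
        """Illustrative only: mirrors the phase ordering and idempotency guard, not the real internals."""

        def __init__(self):
            self._post_init_done = False

        # No-op stand-ins for the phases described in post_init()'s docstring.
        def _resolve_scheme(self):
            pass  # Phase 1: pure config work (data_type / bits / sym / scale_dtype)

        def _resolve_formats(self):
            pass  # Phase 2: needs the attributes resolved in Phase 1

        def _apply_model_patches(self):
            pass  # Phase 3: may replace layers (e.g. MoE expert merging)

        def _build_layer_config(self):
            pass  # Phase 4: walks the patched model

        def _setup_hardware(self):
            pass  # Phase 5: device map, torch.compile, offloading

        def post_init(self):
            if self._post_init_done:  # idempotent: repeated calls are cheap no-ops
                return
            self._resolve_scheme()
            self._resolve_formats()
            self._apply_model_patches()
            self._build_layer_config()
            self._setup_hardware()
            self._post_init_done = True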
diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 4cdc6625d..2fb746987 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -475,10 +475,10 @@ def post_process_cache_data(batch_size, data, data_name): Processed data or None """ new_data = data - if batch_size <= 1: - return new_data if data_name in self.model_context.shared_cache_keys: return None + if batch_size <= 1: + return new_data if "alibi" in data_name: if isinstance(data, torch.Tensor): alibi = data @@ -526,9 +526,7 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): ): if key not in self.inputs[name].keys(): # initialization data = to_device(kwargs[key], device=torch.device("cpu")) - if data is None or ( - self.quantizer.batch_size > 1 and key in self.model_context.shared_cache_keys - ): + if data is None or key in self.model_context.shared_cache_keys: self.inputs[name][key] = data continue if self.quantizer.batch_size <= 1: @@ -1234,7 +1232,6 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - @torch.inference_mode() def quantize(self): """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. @@ -1244,7 +1241,13 @@ def quantize(self): Returns: tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. """ + # post_init must be called OUTSIDE @torch.inference_mode() because + # AutoScheme delta-loss selection requires autograd (backward pass). self.post_init() + return self._quantize_impl() + + @torch.inference_mode() + def _quantize_impl(self): formats = getattr(self, "formats", None) or [] if not (any(fmt.is_gguf() for fmt in formats) or self.super_bits is not None): diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index e37702cb9..65e2c23f0 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -51,7 +51,6 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps self.generator_seed = generator_seed - self.diffusion = True # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 4e3170ae3..75a99da28 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -224,6 +224,7 @@ def set_layer_config( enable_gguf_official_mixed: bool = True, is_mllm: bool = False, fill_default_value=True, + gguf_format_name: str = None, ) -> tuple[dict, bool, dict]: """ Normalize, validate, and expand layer-specific quantization configs. @@ -315,7 +316,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 5. 
collect supported modules embedding_types = (torch.nn.Embedding,) - gguf_name = get_gguf_scheme(default_scheme) + gguf_name = gguf_format_name if gguf_format_name else get_gguf_scheme(default_scheme) if gguf_name: if torch.nn.Embedding not in supported_types: supported_types = (*supported_types, torch.nn.Embedding) @@ -549,7 +550,7 @@ def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model import gguf # pylint: disable=E0401 - from auto_round.schemes import get_gguf_scheme + from auto_round.schemes import QuantizationScheme, get_gguf_scheme from auto_round.utils.common import MM_KEYS, LazyImport from auto_round.utils.model import get_lm_head_name, get_module diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index acfed7772..6d293a846 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -206,7 +206,7 @@ def save_quantized_as_autogptq( # --- 1️⃣ Extract inputs & configs --- quantization_config = serialization_dict - quant_block_list = serialization_dict.get("quant_block_list", get_block_names(model)) + quant_block_list = serialization_dict.get("quant_block_list") or get_block_names(model) processor = kwargs.get("processor") image_processor = kwargs.get("image_processor") safe_serialization = kwargs.get("safe_serialization", True) diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 57d510ac4..c3a06c2ad 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -177,32 +177,29 @@ def pack_gguf_layer( ) ) - if not hasattr(model, "last_layer_name_to_block_name"): - block_name_to_last_layer_name = {} - block_names = get_block_names(model, quant_vision=True) - block_names_flatten = flatten_list(block_names) - all_qlayer_name = [] - for n, m in model.named_modules(): - if not check_to_quantized(m): - continue - all_qlayer_name.append(n) - for block_name in block_names_flatten: - block_name_split = block_name.split(".") - name_split = n.split(".") - if ( - len(name_split) < len(block_name_split) - or name_split[: len(block_name_split)] != block_name_split - ): - continue - block_name_to_last_layer_name[block_name] = n - last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} - model.last_layer_name_to_block_name = last_layer_name_to_block_name - names_in_blocks = [] + if not hasattr(model, "last_layer_name_to_block_name"): + block_name_to_last_layer_name = {} + block_names = get_block_names(model, quant_vision=True) + block_names_flatten = flatten_list(block_names) + all_qlayer_name = [] + for n, m in model.named_modules(): + if not check_to_quantized(m): + continue + all_qlayer_name.append(n) for block_name in block_names_flatten: - block = get_module(model, block_name) - for n, m in block.named_modules(): - if check_to_quantized(m): - names_in_blocks.append(m.global_name) + block_name_split = block_name.split(".") + name_split = n.split(".") + if len(name_split) < len(block_name_split) or name_split[: len(block_name_split)] != block_name_split: + continue + block_name_to_last_layer_name[block_name] = n + last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} + model.last_layer_name_to_block_name = last_layer_name_to_block_name + names_in_blocks = [] + for block_name in block_names_flatten: + block = get_module(model, block_name) + for n, m in block.named_modules(): + if 
check_to_quantized(m): + names_in_blocks.append(m.global_name) if name in model.last_layer_name_to_block_name: # Packing block diff --git a/auto_round/formats.py b/auto_round/formats.py index 3d3f20326..4626abef3 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -733,6 +733,7 @@ class GGUFFormat(OutputFormat): def __init__(self, format: str, ar: BaseCompressor): if format.startswith("gguf:"): + self._original_format = format # preserve "gguf:q2_k_mixed" etc. for Phase 2b self.gguf_args_check(ar, format, model_type=ModelType.TEXT) if ar.mllm: self.gguf_args_check(ar, format, model_type=ModelType.MMPROJ) @@ -768,14 +769,14 @@ def check_scheme_args(cls: OutputFormat, scheme: QuantizationScheme) -> bool: return True def check_and_reset_format(self, ar): - if ar.iters != 0 and ar.bits != 3 and not ar.enable_alg_ext: + if getattr(ar, "iters", 0) != 0 and ar.bits != 3 and not ar.enable_alg_ext: logger.warning_once( "`iters=0` is recommended when exporting to current GGUF format" " or add `enable_alg_ext` for better accuracy with much more tuning cost." " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" " for the accuracy results." ) - elif ar.bits >= 8 and ar.iters != 0: + elif ar.bits >= 8 and getattr(ar, "iters", 0) != 0: logger.warning_once("`iters=0` is recommended for bits>=8") if getattr(ar, "quant_nontext_module", False): diff --git a/auto_round/modeling/fused_moe/gpt_oss.py b/auto_round/modeling/fused_moe/gpt_oss.py index 959e01356..da68a57fc 100644 --- a/auto_round/modeling/fused_moe/gpt_oss.py +++ b/auto_round/modeling/fused_moe/gpt_oss.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import torch import transformers from packaging import version @@ -33,6 +32,7 @@ class GPTOssSingleExpert(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, dtype: torch.dtype | None = None): super().__init__() self.hidden_size = hidden_size @@ -101,17 +101,26 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: B, T, H = hidden_states.shape x = hidden_states.reshape(-1, H) - # Use the original router (it returns scores and indices already softmaxed over top-k) - router_scores, router_indices = self.router(x) # scores: [tokens, E], indices: [tokens, k] + # Use the original router (it returns logits, scores and indices) + router_out = self.router(x) + if len(router_out) == 3: + _, router_scores, router_indices = router_out + else: + router_scores, router_indices = router_out final_hidden_states = self.shared_expert(x) if self.shared_expert is not None else torch.zeros_like(x) num_all_tokens, total_num_experts = x.size(0), self.num_experts mask_weights = torch.zeros((num_all_tokens, total_num_experts), dtype=x.dtype, device=x.device) - topk_ids, experts_mask = router_indices, router_scores - topk_ids = topk_ids.to(torch.int64) + topk_ids = router_indices.to(torch.int64) mask_weights.scatter_(-1, topk_ids, 1) + # Build per-expert routing score matrix: shape (num_experts, num_tokens) + # experts_mask[e, t] = router score of expert e for token t (0 if not selected) + expert_score_matrix = torch.zeros_like(mask_weights) + expert_score_matrix.scatter_(-1, topk_ids, router_scores) + expert_score_matrix = expert_score_matrix.transpose(0, 1) # (num_experts, num_tokens) + mask_weights = mask_weights[:num_all_tokens, :total_num_experts] mask_weights = mask_weights.transpose(0, 1) @@ -124,7 +133,7 @@ def forward(self, hidden_states: torch.Tensor) -> 
torch.Tensor: mask_weight = mask_weights[expert_idx].unsqueeze(1) current_state_static = x * mask_weight expert_output = self.experts[expert_idx](current_state_static) - expert_output = expert_output * experts_mask[expert_idx].unsqueeze(1) + expert_output = expert_output * expert_score_matrix[expert_idx].unsqueeze(1) final_hidden_states += expert_output return final_hidden_states.view(B, T, H), router_scores.view(B * T, -1) diff --git a/auto_round/modeling/unfused_moe/__init__.py b/auto_round/modeling/unfused_moe/__init__.py index 3b9511731..a112a2a15 100644 --- a/auto_round/modeling/unfused_moe/__init__.py +++ b/auto_round/modeling/unfused_moe/__init__.py @@ -145,7 +145,10 @@ def get_file_path_via_model_name(model_or_path: str, file_name): def pre_check_config(model_name: str | torch.nn.Module, trust_remote_code: bool = True): if isinstance(model_name, str): - config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) + try: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) + except (OSError, EnvironmentError, ValueError): + return False elif isinstance(model_name, torch.nn.Module): config = getattr(model_name, "config", None) if config is None: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index ba6c934f3..b25db668a 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -757,7 +757,10 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = No model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path # For dummy model, model_path could be "". - if model_path and not os.path.isdir(model_path): + # Only try to download if the path looks like a HF repo id (not a local filesystem path). + # Skip download for absolute paths or relative paths that contain current/parent dir markers. 
+ _is_local_path = os.path.isabs(model_path) or model_path.startswith("./") or model_path.startswith("../") + if model_path and not os.path.isdir(model_path) and not _is_local_path: model_path = download_or_get_path(model_path, platform=platform) if isinstance(model_path, str): From b753bab110b02a05cc4a3f00577f200789a728ea Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 30 Mar 2026 15:55:52 +0800 Subject: [PATCH 20/90] decoupling quantization and refactor hadamard Signed-off-by: n1ck-guo --- .../quantization/auto_round/adam.py | 2 + .../quantization/auto_round/config.py | 9 +- .../quantization/auto_round/quantizer.py | 180 ++------- auto_round/algorithms/quantization/base.py | 290 ++------------ auto_round/algorithms/quantization/config.py | 88 ++--- .../algorithms/quantization/rtn/config.py | 6 +- .../algorithms/quantization/rtn/quantizer.py | 78 ++-- auto_round/algorithms/rotation/__init__.py | 148 +++++++ auto_round/algorithms/rotation/base.py | 168 ++++++++ .../algorithms/rotation/hadamard/__init__.py | 38 ++ .../algorithms/rotation/hadamard/apply.py | 286 ++++++++++++++ .../algorithms/rotation/hadamard/config.py | 112 ++++++ .../algorithms/rotation/hadamard/patch.py | 228 +++++++++++ .../rotation/hadamard/transforms.py | 171 +++++++++ .../rotation/hadamard/utils/__init__.py | 2 + .../hadamard/utils/hadamards.safetensors | Bin 0 -> 1436901 bytes .../rotation/hadamard/utils/math.py | 143 +++++++ .../rotation/hadamard/utils/matrix.py | 101 +++++ .../hadamard/utils/triton/__init__.py | 2 + .../rotation/hadamard/utils/triton/mxfp4.py | 192 +++++++++ auto_round/compressors_new/base.py | 363 ++++++++++++++++-- auto_round/compressors_new/calib.py | 143 ++++++- auto_round/compressors_new/entry.py | 68 +++- auto_round/compressors_new/utils.py | 12 +- auto_round/compressors_new/zero_shot.py | 295 ++++---------- 25 files changed, 2333 insertions(+), 792 deletions(-) create mode 100644 auto_round/algorithms/rotation/__init__.py create mode 100644 auto_round/algorithms/rotation/base.py create mode 100644 auto_round/algorithms/rotation/hadamard/__init__.py create mode 100644 auto_round/algorithms/rotation/hadamard/apply.py create mode 100644 auto_round/algorithms/rotation/hadamard/config.py create mode 100644 auto_round/algorithms/rotation/hadamard/patch.py create mode 100644 auto_round/algorithms/rotation/hadamard/transforms.py create mode 100644 auto_round/algorithms/rotation/hadamard/utils/__init__.py create mode 100644 auto_round/algorithms/rotation/hadamard/utils/hadamards.safetensors create mode 100644 auto_round/algorithms/rotation/hadamard/utils/math.py create mode 100644 auto_round/algorithms/rotation/hadamard/utils/matrix.py create mode 100644 auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py create mode 100644 auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py diff --git a/auto_round/algorithms/quantization/auto_round/adam.py b/auto_round/algorithms/quantization/auto_round/adam.py index 3f0f87325..b7a6131da 100644 --- a/auto_round/algorithms/quantization/auto_round/adam.py +++ b/auto_round/algorithms/quantization/auto_round/adam.py @@ -22,6 +22,8 @@ class ARAdamQuantizer(ARQuantizer): + is_adam: bool = True + def _get_optimizer(self, optimizer): if optimizer is None: optimizer = torch.optim.AdamW diff --git a/auto_round/algorithms/quantization/auto_round/config.py b/auto_round/algorithms/quantization/auto_round/config.py index 781ea38c9..66920a45c 100644 --- a/auto_round/algorithms/quantization/auto_round/config.py +++ 
b/auto_round/algorithms/quantization/auto_round/config.py @@ -14,9 +14,7 @@ from typing import Union from auto_round.algorithms.quantization.config import QuantizationConfig -from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme class AutoRoundConfig(QuantizationConfig): @@ -36,8 +34,7 @@ class AutoRoundConfig(QuantizationConfig): def __init__( self, - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + layer_config: dict[str, Union[str, dict]] = None, *, iters: int = 200, lr: float = None, @@ -59,7 +56,7 @@ def __init__( enable_adam: bool = False, **kwargs, ): - super().__init__(scheme=scheme, layer_config=layer_config, **kwargs) + super().__init__(layer_config=layer_config, **kwargs) self.iters = iters if self.iters < 0: logger.warning("`iters` must be non-negative, reset it to 200") @@ -67,7 +64,7 @@ def __init__( if not lr: # TODO need to check 4 bits lr setting for auto-round-best, 3bits only validate on small models - if self.iters >= 1000 and self.bits <= 3: + if self.iters >= 1000 and self.bits is not None and self.bits <= 3: self.lr = 2.0 / self.iters logger.info("set the lr to 2.0/iters for better accuracy") else: diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 617b9ab9c..066d77e34 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -62,6 +62,7 @@ class ARQuantizer(BaseQuantizers): + is_adam: bool = False def __init__(self, config: AutoRoundConfig): super().__init__(config) @@ -218,129 +219,40 @@ def _get_loss( return loss def quantize_block( - self, - block_name: Union[str, list[str]], - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - auto_offload=True, - **kwargs, - ): - """Quantize a block (or multiple blocks fused as WrapperMultiblock). - - Args: - block_name: A single block name, or a list of names when nblocks > 1. - The module(s) are retrieved internally via get_module(). - """ - if isinstance(block_name, list): - from auto_round.wrapper import WrapperMultiblock - - modules = [get_module(self.model, n) for n in block_name] - block = WrapperMultiblock(modules) - else: - block = get_module(self.model, block_name) - q_outputs, output = self._quantize_block( - block, input_ids, input_others, q_input=q_input, auto_offload=auto_offload, **kwargs - ) - if self.compress_context.is_immediate_saving: - for n, tmp_m in block.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - immediate_pack(tmp_m.global_name, self.layer_config) - return q_outputs, output - - def _quantize_block( self, block: torch.nn.Module, input_ids: Union[list[torch.Tensor], dict], input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - auto_offload=True, + reference_output, + *, + loss_device: Union[str, torch.device], + mid_iter_mem_check: bool = False, **kwargs, - ): - """Quantize the weights of a given block of the model. + ) -> dict: + """Apply the AutoRound optimization algorithm to a block. + + This is the pure-algorithm entry point. 
All infrastructure concerns + (device placement, act-max hook collection, reference-output caching, + DDP setup, memory cleanup, logging) are handled by the Compressor + before and after this call. Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. + block: Module already placed on the correct device(s). + input_ids: Calibration inputs (already on cache_device). + input_others: Additional inputs for the block's forward pass. + reference_output: FP reference outputs collected by the Compressor. + loss_device: Device on which to compute the MSE loss. + mid_iter_mem_check: Pre-evaluated by the Compressor as + ``low_gpu_mem_usage and card_0_in_high_risk``. When True, + triggers mid-iteration memory threshold checks to reduce + fragmentation on the primary GPU. Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + best_params: Best quantization parameters found during optimization. + Empty dict if no trainable parameters were found. """ device = self.compress_context.device - materialize_model_(block) - convert_module_to_hp_if_necessary(block, self.model_context.amp_dtype, device) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, - self.compress_context.device_map, - input_ids, - self.compress_context.low_gpu_mem_usage, - self.batch_size, - device, - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(self.compress_context.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input if q_input is not None else input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=self.compress_context.device_list) - else: - clear_memory(device_list=self.compress_context.device_list) - input_ids = q_input - quantized_layer_names, unquantized_layer_names = self.wrapper_block( block, self.enable_minmax_tuning, @@ -367,9 +279,8 @@ def _quantize_block( lr = torch.tensor(self.lr) minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - 
extra_kwargs = {} if is_adam else {"momentum": self.momentum} + extra_kwargs = {} if self.is_adam else {"momentum": self.momentum} if self.enable_minmax_tuning: params = [ @@ -393,8 +304,7 @@ def _quantize_block( ) logger.info(dump_info) unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output + return {} if self.lr_scheduler is None: lr_schedule = torch.optim.lr_scheduler.LinearLR( @@ -438,20 +348,20 @@ def _quantize_block( for tmp_step in range(self.gradient_accumulate_steps): indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) + current_output = self._get_current_output(reference_output, indices) current_output = to_device(current_output, loss_device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) loss = self._get_loss(output_q, current_output, indices, mse_loss, device) num_elm = 1 if num_elm <= 0 else num_elm total_loss += loss.item() / num_elm - if self.compress_context.low_gpu_mem_usage and card_0_in_high_risk: + if mid_iter_mem_check: # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compress_context.device_list) self._scale_loss_and_backward(scaler, loss) - if self.compress_context.low_gpu_mem_usage and card_0_in_high_risk: + if mid_iter_mem_check: # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compress_context.device_list) @@ -462,8 +372,6 @@ def _quantize_block( best_loss = total_loss if not self.not_use_best_mse: best_params = collect_best_params(block, self.compress_context.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - last_best_iter = i if self.not_use_best_mse and i == self.iters - 1: best_params = collect_best_params(block, self.compress_context.cache_device) @@ -492,7 +400,7 @@ def _quantize_block( if self.compress_context.low_gpu_mem_usage: clear_memory(device_list=self.compress_context.device_list) # clear cached memory during training if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") + logger.info(f"Unquantized layers: {unquantized_layer_names}") with torch.no_grad(): unwrapper_block(block, best_params) @@ -500,34 +408,8 @@ def _quantize_block( # enable moe experts act_max automatic generation for WrapperWALayer set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - ) - - if len(self.compress_context.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=self.compress_context.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(self.compress_context.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=self.compress_context.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output + logger.infoclean(dump_info) + return best_params def quantize_layer( self, 
layer_name: str, input_ids: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu", **kwargs diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 1218a6111..7a60f5191 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -11,69 +11,39 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import importlib -import sys import traceback -from dataclasses import fields import torch from auto_round.algorithms.quantization.config import QuantizationConfig from auto_round.compressors_new.utils import ( - IndexSampler, - _get_quantized_layer_names_outside_blocks, - block_forward, check_need_act_calibration, - check_skippable_keywords, - collect_best_params, - get_shared_keys, - infer_bits_by_data_type, - init_cache, - set_layer_config, ) -from auto_round.context.compress import CompressContext -from auto_round.context.model import ModelContext from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.logger import logger -from auto_round.schemes import ( - QuantizationScheme, - _handle_special_schemes, - _parse_scheme, - get_gguf_scheme, - preset_name_to_scheme, -) -from auto_round.special_model_handler import get_predefined_ignore_layers, update_module from auto_round.utils import ( INNER_SUPPORTED_LAYER_TYPES, SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, - compress_layer_names, - convert_dtype_str2torch, - find_matching_blocks, - get_block_names, - is_quantized_input_module, ) class BaseQuantizers: - # Class-level attribute declarations for dynamic properties set in post_init() - # These prevent pylint E1101 (no-member) and E0203 (access-member-before-definition) errors + # Class-level attribute declarations for convenient access in quantization methods. + # Scheme-related attrs (layer_config, scale_dtype, has_qlayer_outside_block, etc.) + # are resolved by SchemeMixin in BaseCompressor and synced here after post_init(). model_context = None compress_context = None dataset = None - quant_block_list = None - orig_scheme = None - is_auto_scheme = False supported_types = SUPPORTED_LAYER_TYPES inner_supported_types = INNER_SUPPORTED_LAYER_TYPES def __init__(self, config: QuantizationConfig): self.config = config self.layer_config = config.layer_config - self.scheme = config.scheme self.bits = config.bits self.group_size = config.group_size self.sym = config.sym @@ -89,8 +59,6 @@ def __init__(self, config: QuantizationConfig): self.ignore_layers = config.ignore_layers self.quant_lm_head = config.quant_lm_head self.to_quant_block_names = config.to_quant_block_names - # Instance-level flag: avoids class-level sharing that could cause cross-instance bugs. - self._scheme_resolved = False @classmethod def from_config(cls, config: QuantizationConfig): @@ -115,224 +83,16 @@ def amp_dtype(self): return getattr(self.model_context, "amp_dtype", torch.float32) - def resolve_scheme( - self, - model_context: "ModelContext", - compress_context: "CompressContext", - dataset: str = None, - ) -> None: - """Phase-1 init: resolve scheme and bind config attrs (no model structure needed). - - Must be called BEFORE get_formats() and BEFORE post_init(). - Idempotent: safe to call multiple times. - - Args: - model_context: The ModelContext created by BaseCompressor. 
- compress_context: The CompressContext created by BaseCompressor. - dataset: Calibration dataset name/path. Used by AutoScheme's delta-loss - scheme selection. Callers should pass the compressor's own dataset - (or a sensible default) rather than leaving it to a global lookup. - """ - if self._scheme_resolved: - return - - self.model_context = model_context - self.compress_context = compress_context - self.model = model_context.model - if dataset is not None: - self.dataset = dataset - - # Build user-specified overrides from fields defined in QuantizationScheme. - scheme_fields = {f.name for f in fields(QuantizationScheme)} - user_scheme_overrides = {k: v for k in scheme_fields if (v := getattr(self.config, k, None)) is not None} - default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides) - - # Bind resolved attrs to config and self for convenient access. - for key, value in final_attrs.items(): - setattr(self.config, key, value) - if hasattr(self, key): - setattr(self, key, value) - self.config.check_config() - - self.orig_scheme = copy.deepcopy(self.scheme) - self.scheme = default_scheme - - # GGUF format uses fp32 scale dtype; everything else defaults to fp16. - gguf_scheme_name = get_gguf_scheme(self.scheme) - if self.scale_dtype is None: - self.scale_dtype = "fp32" if gguf_scheme_name else "fp16" - self.scale_dtype = convert_dtype_str2torch(self.scale_dtype) - - self._scheme_resolved = True - - def post_init(self) -> None: - """Phase-2 init: build layer config on the patched model. - - Requires resolve_scheme() to have been called first (asserted below). - Must be called AFTER model_context.apply_patches(). - """ - assert self._scheme_resolved, ( - "resolve_scheme() must be called before post_init(). " "BaseCompressor.post_init() does this automatically." 
- ) - - enable_gguf_official_mixed = not self.is_auto_scheme - - if self.quant_block_list is None: - quant_nontext_module = getattr(self.model_context, "quant_nontext_module", False) - all_blocks = get_block_names(self.model_context.model, quant_vision=quant_nontext_module) - self.quant_block_list = find_matching_blocks( - self.model_context.model, all_blocks, self.to_quant_block_names - ) - if self.to_quant_block_names is None and self.quant_block_list: - from auto_round.utils import extract_block_names_to_str - - self.to_quant_block_names = extract_block_names_to_str(self.quant_block_list) - self.config.to_quant_block_names = self.to_quant_block_names - - self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) - - def _gen_auto_scheme(self) -> dict[str, dict]: - if self.model_context.is_mllm: - logger.info("AutoScheme is not yet supported for multimodal LLMs.") - sys.exit(-1) - - if is_quantized_input_module(self.model_context.model): - logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") - sys.exit(-1) - - all_dtypes = [] - all_gguf = True - for option in self.orig_scheme.options: - # Resolve the quantization scheme or data type - dtype = "int" - if isinstance(option, str): - if not option.lower().startswith("gguf"): - all_gguf = False - - option = preset_name_to_scheme(option) - - else: - all_gguf = False - - if isinstance(option, QuantizationScheme): - dtype = option.data_type - elif isinstance(option, dict): - dtype = option.get("data_type", "int") - - all_dtypes.append(dtype) - - # Check for mixed data types - unique_dtypes = set(all_dtypes) - if len(unique_dtypes) > 1 and not all_gguf: - logger.warning( - "Models with mixed data_types " - "cannot yet be exported to real formats except GGUF. " - "Please save the model using the `fake` format for now." - ) - - layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( - self.model_context.model, - self.layer_config, - self.scheme, - self.scale_dtype, - self.supported_types, - self.inner_supported_types, - self.quant_block_list, - self.ignore_layers, - self.quant_lm_head, - enable_gguf_official_mixed=False, - is_mllm=self.model_context.is_mllm, + def resolve_scheme(self, *args, **kwargs) -> None: + raise NotImplementedError( + "resolve_scheme() has been moved to BaseCompressor in compressors_new/base.py. " + "Call BaseCompressor.post_init() instead." 
) - quant_layer_names = layer_config.keys() - scheme_keys = {f.name for f in fields(QuantizationScheme)} - fixed_layer_scheme_new = { - k: {key: v[key] for key in scheme_keys & v.keys()} - for k, v in layer_config.items() - if v.get("fixed_by_user", False) - } - - # mainly using quant_layers and fixed by users - from auto_round.auto_scheme.gen_auto_scheme import GenScheme - - if ( - not self.compress_context.enable_torch_compile - and self.super_bits is None - and not self.orig_scheme.low_gpu_mem_usage - ): - logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") - self.scheme_generator = GenScheme( - self.orig_scheme, - self.model_context.model, - quant_layer_names, - fixed_layer_scheme_new, - self.dataset, - device_map=self.compress_context.device_map, - tokenizer=self.model_context.tokenizer, - enable_torch_compile=self.compress_context.enable_torch_compile, - ) - layer_config = self.scheme_generator.get_layer_config() - return layer_config - - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): - # before get_format, therefore, compress_context.formats is str - is_gguf_format = (f := getattr(self.compress_context, "formats", None)) is not None and "gguf" in f - predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) - compressed_predefined_ignore_layers = compress_layer_names(predefined_ignore_layers) - if not is_gguf_format: - predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) - if predefined_ignore_layers: - logger.info(f"Using predefined ignore_layers: {compressed_predefined_ignore_layers}") - tmp_str = ",".join(predefined_ignore_layers) - if self.ignore_layers == "": - self.ignore_layers = tmp_str - else: - self.ignore_layers += "," + tmp_str - if self.is_auto_scheme: - self.layer_config = self._gen_auto_scheme() - else: - self.layer_config = _handle_special_schemes( - self.orig_scheme, - self.layer_config, - self.model_context.model, - supported_types=SUPPORTED_LAYER_TYPES, - inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, - quant_lm_head=self.quant_lm_head, - mllm=self.model_context.is_mllm, - ) - # For GGUF _mixed formats (e.g. gguf:q2_k_mixed), the inner GGUFFormat - # stores the mixed-handling result in ar.layer_config which is NOT the - # same object as quantizer.layer_config. Re-apply the special scheme - # on the patched model here (after Phase 3) so layer names are correct. 
- _gguf_orig_fmt = getattr(self, "_gguf_original_format_name", None) - if _gguf_orig_fmt and "_MIXED" in _gguf_orig_fmt.upper(): - self.layer_config = _handle_special_schemes( - _gguf_orig_fmt.lower(), - self.layer_config, - self.model_context.model, - supported_types=SUPPORTED_LAYER_TYPES, - inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, - quant_lm_head=self.quant_lm_head, - mllm=self.model_context.is_mllm, - ) - - fill_default_value = True - if self.is_auto_scheme: - fill_default_value = False - self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( - self.model_context.model, - self.layer_config, - self.scheme, - self.scale_dtype, - SUPPORTED_LAYER_TYPES, - INNER_SUPPORTED_LAYER_TYPES, - self.quant_block_list, - self.ignore_layers, - self.quant_lm_head, - enable_gguf_official_mixed=enable_gguf_official_mixed, - is_mllm=self.model_context.is_mllm, - fill_default_value=fill_default_value, - gguf_format_name=getattr(self, "_gguf_format_name", None), + def post_init(self, *args, **kwargs) -> None: + raise NotImplementedError( + "post_init() has been moved to BaseCompressor/_scheme_post_init() in " + "compressors_new/base.py. Call BaseCompressor.post_init() instead." ) def _register_act_max_hook(self, model): @@ -479,14 +239,30 @@ def _quantize_embedding_layer(self): return is_quantized - def quantize_block(self, block_name: str, input_ids=None, input_others=None, **kwargs): - """Quantizes a given block of the model. + def quantize_block( + self, block: torch.nn.Module, input_ids=None, input_others=None, reference_output=None, **kwargs + ) -> dict: + """Apply the quantization algorithm to a prepared block. + + This is the **pure-algorithm** entry point called by the Compressor after + all infrastructure work (device placement, data collection, act-max hook + registration, DDP setup) has been completed. + + Implementations should: + - Perform the algorithm-specific weight/activation quantization on ``block``. + - Return a dict of best parameters (may be empty for zero-shot algorithms). Args: - block_name (str): The name of the block to quantize. The block module is - retrieved internally via get_module(model, block_name). - input_ids: Calibration inputs for the block (required by gradient-based quantizers). - input_others (dict): Additional inputs for the block's forward pass. + block: Module already placed on the correct device(s). + input_ids: Calibration inputs on cache_device (None for zero-shot RTN). + input_others: Additional inputs (None for zero-shot RTN). + reference_output: FP reference outputs collected by Compressor + (None for algorithms that don't need a reconstruction loss). + **kwargs: Algorithm-specific keyword arguments (e.g. ``loss_device``, + ``card_0_in_high_risk`` for ARQuantizer). + + Returns: + dict: Best quantization parameters found, or ``{}`` if not applicable. """ raise NotImplementedError("quantize_block must be implemented in subclasses of BaseQuantizers") diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index 0c5cbca79..45eb47bcf 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -11,23 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
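As a reading aid, here is a minimal sketch (not part of the patch) of a subclass honouring the new pure-algorithm `quantize_block()` contract documented above; `NoOpQuantizer` is a hypothetical name, and only interfaces introduced in this diff are assumed:

    import torch

    from auto_round.algorithms.quantization.base import BaseQuantizers


    class NoOpQuantizer(BaseQuantizers):
        """Hypothetical zero-shot quantizer that leaves the block unchanged."""

        def quantize_block(
            self, block: torch.nn.Module, input_ids=None, input_others=None, reference_output=None, **kwargs
        ) -> dict:
            # Device placement, calibration-data collection and hook registration
            # are already handled by the Compressor; nothing to tune here.
            return {}
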
-import copy from dataclasses import dataclass from enum import Enum from typing import ClassVar, Union from auto_round.algorithms.alg_config import AlgConfig -from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG from auto_round.logger import logger -from auto_round.schemes import ( - QuantizationScheme, - _handle_special_schemes, - _parse_scheme, - get_gguf_scheme, - preset_name_to_scheme, -) -from auto_round.utils import convert_dtype_str2torch +from auto_round.schemes import QuantizationScheme class BackendDataType(str, Enum): @@ -43,7 +34,6 @@ class QuantizationConfig(AlgConfig): _alg_cls: ClassVar[str] = None # quantization args - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None bits: int = None group_size: int = None @@ -62,12 +52,10 @@ class QuantizationConfig(AlgConfig): to_quant_block_names: Union[str, list, None] = None def __post_init__(self): - # Resolve scheme attributes early so properties (is_act_nv_fp, is_wfp8afp8, etc.) - # work correctly at construction time without waiting for post_init(). - self._early_resolve_scheme() # Run block-wise validation early (at construction time, before model loading). - # Guard with None checks because _early_resolve_scheme may leave attributes unresolved - # (e.g. when scheme is an AutoScheme that needs model info). + # Scheme resolution is deferred to BaseCompressor.post_init() via SchemeMixin. + # Guard with None checks in case the user hasn't explicitly set data_type/bits + # (they will be resolved from scheme by the compressor before use). if self.group_size is not None and isinstance(self.group_size, (tuple, list)): if not ( self.data_type is not None @@ -95,37 +83,6 @@ def __post_init__(self): "`act_group_size` must be -1 (per channel), 0 (per-tensor), or a positive integer, not a tuple." ) - def _early_resolve_scheme(self) -> None: - """Resolve scheme attributes early so properties work from init time. - - Both entry.py routing (needs_act_calib) and BaseCompressor._adjust_torch_compile - need resolved attributes (act_data_type, data_type, is_act_nv_fp, ...) before - BaseQuantizers.post_init() runs (which is deferred until quantize() / after model - loading). This method performs the same _parse_scheme() call eagerly so those - attributes are available from construction time. - - AutoScheme is left deferred because it requires model information to select its - concrete option. - """ - if isinstance(self.scheme, AutoScheme): - # AutoScheme needs model info for option selection — defer to post_init - return - - # Collect fields that exist in both QuantizationScheme and QuantizationConfig - # where the user explicitly provided a value (non-None). These override the - # scheme's built-in defaults so that e.g. RTNConfig(scheme="NVFP4", bits=8) - # expands NVFP4 but keeps bits=8 instead of the scheme's default bits=4. - user_scheme_overrides = { - k: getattr(self, k) for k in QuantizationScheme.get_attributes() if getattr(self, k, None) is not None - } - - try: - _, _, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides) - vars(self).update(final_attrs) - except Exception: - # Silently ignore failures — post_init() will do the authoritative resolution - pass - @staticmethod def _is_valid_group_size(gs) -> bool: """Return True if gs is a valid group_size value. 
@@ -205,48 +162,61 @@ def is_act_quantize(self): @property def is_nv_fp(self): - return BackendDataType.NV_FP in self.data_type + return self.data_type is not None and BackendDataType.NV_FP in self.data_type @property def is_act_nv_fp(self): - return BackendDataType.NV_FP in self.act_data_type + return self.act_data_type is not None and BackendDataType.NV_FP in self.act_data_type @property def is_mx_fp(self): - return BackendDataType.MX_FP in self.data_type + return self.data_type is not None and BackendDataType.MX_FP in self.data_type @property def is_act_mx_fp(self): - return BackendDataType.MX_FP in self.act_data_type + return self.act_data_type is not None and BackendDataType.MX_FP in self.act_data_type @property def is_dynamic_wint8aint8(self): if self.act_dynamic: return True - if ("int8" in self.act_data_type or ("int" in self.act_data_type and self.act_bits == 8)) and ( - "int8" in self.data_type or ("int" in self.data_type and self.bits == 8) - ): - return True + if self.act_data_type is not None and self.data_type is not None: + if ("int8" in self.act_data_type or ("int" in self.act_data_type and self.act_bits == 8)) and ( + "int8" in self.data_type or ("int" in self.data_type and self.bits == 8) + ): + return True return False @property def is_standard_fp(self): - return BackendDataType.STANDARD_FP in self.data_type and not self.is_mx_fp and not self.is_nv_fp + return ( + self.data_type is not None + and BackendDataType.STANDARD_FP in self.data_type + and not self.is_mx_fp + and not self.is_nv_fp + ) @property def is_act_standard_fp(self): - return BackendDataType.STANDARD_FP in self.act_data_type and not self.is_act_mx_fp and not self.is_act_nv_fp + return ( + self.act_data_type is not None + and BackendDataType.STANDARD_FP in self.act_data_type + and not self.is_act_mx_fp + and not self.is_act_nv_fp + ) @property def is_static_afp8(self): - return BackendDataType.FP8_STATIC in self.act_data_type + return self.act_data_type is not None and BackendDataType.FP8_STATIC in self.act_data_type @property def is_static_wfp8afp8(self): - return BackendDataType.FP8_STATIC in self.data_type and self.is_static_afp8 + return self.data_type is not None and BackendDataType.FP8_STATIC in self.data_type and self.is_static_afp8 @property def is_wfp8afp8(self): + if self.act_data_type is None or self.data_type is None: + return False if ( ("fp8" in self.act_data_type or ("fp" in self.act_data_type and self.act_bits == 8)) and ("fp8" in self.data_type or ("fp" in self.data_type and self.bits == 8)) diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py index d31471f7e..e470a49b2 100644 --- a/auto_round/algorithms/quantization/rtn/config.py +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -22,7 +22,6 @@ class RTNConfig(QuantizationConfig): def __init__( self, - scheme="W4A16", layer_config=None, *, disable_opt_rtn: bool = None, @@ -34,7 +33,7 @@ def __init__( ): # pop before super().__init__ so it doesn't leak into QuantizationConfig as an unknown kwarg enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) - super().__init__(scheme=scheme, layer_config=layer_config, **kwargs) + super().__init__(layer_config=layer_config, **kwargs) self.seqlen = seqlen self.nsamples = nsamples @@ -53,9 +52,6 @@ def __init__( self.orig_disable_opt_rtn = disable_opt_rtn if disable_opt_rtn is None: - if isinstance(scheme, str) and scheme in ["W8A16", "W8A8"]: - logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve 
efficiency.") - disable_opt_rtn = True if self.bits and self.bits >= 8 and self.act_bits and self.act_bits >= 8 and self.data_type == "int": logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") disable_opt_rtn = True diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 7467a4faf..3466507ab 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -68,8 +68,23 @@ class RTNQuantizer(BaseQuantizers): def __init__(self, config: RTNConfig): BaseQuantizers.__init__(self, config) - def quantize_block(self, block_name: str, **kwargs): - block = get_module(self.model, block_name) + def quantize_block( + self, block: torch.nn.Module, input_ids=None, input_others=None, reference_output=None, **kwargs + ) -> dict: + """Apply zero-shot RTN quantization to a block. + + Pure-algorithm entry point. Materialize / device placement is handled + by the Compressor before calling this method. + + Args: + block: Module already materialized. + input_ids: Unused for zero-shot RTN (accepted for interface consistency). + input_others: Unused for zero-shot RTN. + reference_output: Unused for zero-shot RTN. + + Returns: + dict: Empty dict (zero-shot RTN has no tunable parameters to return). + """ shard_writer = ShardWriter.get_shard_writer() tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) @@ -104,15 +119,18 @@ def quantize_block(self, block_name: str, **kwargs): m.to("meta") # Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage. - # This mirrors _quantize_via_rtn_blockwise's post-block cleanup. if not self.compress_context.is_immediate_saving: mv_module_from_gpu(block) else: # Save once at block scope to capture tensors that are not saved # in per-layer branch (e.g., custom module-level params/buffers). - shard_writer.write(name=block_name) + block_name = getattr(block, "name", None) or getattr(block, "global_name", None) + if block_name: + shard_writer.write(name=block_name) block.to("meta") + return {} + def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. @@ -229,46 +247,18 @@ def __init__(self, config: RTNConfig): self.enable_alg_ext = True - def quantize_block(self, block_name: str, input_ids: Union[list[torch.Tensor], dict], input_others: dict, **kwargs): - block = get_module(self.model, block_name) - materialize_model_(block) - block.to("cpu") + def quantize_block(self, block: torch.nn.Module, **kwargs): + """Apply imatrix-informed RTN quantization to a block. 
- block = convert_module_to_hp_if_necessary( - block, dtype=self.model_context.amp_dtype, device=self.compress_context.device - ) - update_block_global_scale_if_needed(block, self.data_type, self.group_size) - self._register_act_max_hook(block) - if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: - set_auto_device_map_for_block_with_tuning( - block, - self.compress_context.device_map, - input_ids, - self.compress_context.low_gpu_mem_usage, - self.batch_size, - self.compress_context.device, - ) - # Dispatch model if needed - if len(self.compress_context.device_list) > 1: - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - for _, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - else: - block = block.to(self.compress_context.device) - input_ids = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - ) - - if len(self.compress_context.device_list) > 1: - accelerate.hooks.remove_hook_from_submodules(block) + Pure-algorithm entry point. All infrastructure (device placement, + act-max hook registration, imatrix collection, cleanup) is handled + by the Compressor before calling this method. + Args: + block: Module already placed on the correct device(s) with act_max + attributes populated by the Compressor's hook pass. + """ + update_block_global_scale_if_needed(block, self.data_type, self.group_size) if self.config.is_act_nv_fp or self.config.is_static_afp8: # enable moe experts act_max automatic generation for Linear set_amax_for_all_moe_layers(block, attr_name="act_max") @@ -278,15 +268,11 @@ def quantize_block(self, block_name: str, input_ids: Union[list[torch.Tensor], d clear_memory(device_list=self.compress_context.device_list) for name, m in block.named_modules(): - # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu - # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt if hasattr(m, "global_name") and check_to_quantized(m): self.quantize_layer(m.global_name) - mv_module_from_gpu(block) - @torch.no_grad() def _get_block_outputs( self, diff --git a/auto_round/algorithms/rotation/__init__.py b/auto_round/algorithms/rotation/__init__.py new file mode 100644 index 000000000..06e1bbdc9 --- /dev/null +++ b/auto_round/algorithms/rotation/__init__.py @@ -0,0 +1,148 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Weight/activation rotation algorithm package. + +This package houses all *pre-quantisation rotation/transform* algorithms – +mathematical operations applied to model weights or activations before the +quantisation step to improve numerical properties. 
+ +Current algorithms +------------------ +* **hadamard** – Block-diagonal Hadamard rotations (QuaRot / SpinQuant style). + See :mod:`auto_round.algorithms.rotation.hadamard`. + +Adding a new algorithm +----------------------- +1. Create ``algorithms/rotation//`` with ``config.py`` and ``apply.py``. +2. Subclass :class:`BaseRotationConfig` and :class:`BaseRotation`; register + with ``@BaseRotation.register("")``. +3. Re-export from this ``__init__.py``. + +Typical usage +------------- +>>> from auto_round.algorithms.rotation import apply_rotation +>>> model = apply_rotation(model, config={"hadamard_type": "random_hadamard"}) +""" +from __future__ import annotations + +from typing import Any + +import torch + +from auto_round.algorithms.rotation.base import ( + BaseRotation, + BaseRotationConfig, + ROTATION_SUPPORTED_SCHEMES, + check_supported_schemes, +) +from auto_round.algorithms.rotation.hadamard import ( + HadamardConfig, + HadamardRotation, + apply_hadamard_transform, + normalize_hadamard_config, +) + +__all__ = [ + # Base interfaces + "BaseRotation", + "BaseRotationConfig", + "ROTATION_SUPPORTED_SCHEMES", + "check_supported_schemes", + # Hadamard + "HadamardConfig", + "HadamardRotation", + "apply_hadamard_transform", + "normalize_hadamard_config", + # Unified entry + "apply_rotation", + "normalize_rotation_config", +] + + +def normalize_rotation_config( + config: Any, +) -> BaseRotationConfig | None: + """Normalise any supported config form to the canonical :class:`BaseRotationConfig` subclass. + + Dispatches by inspecting the ``algorithm`` field (or missing field for + legacy dicts that only carry Hadamard keys). + + Args: + config: One of: ``None``, :class:`HadamardConfig`, a ``dict`` with + an ``"algorithm"`` key, or a plain Hadamard shorthand string. + + Returns: + The appropriate :class:`BaseRotationConfig` subclass, or ``None`` + when *config* is ``None`` / empty. + """ + if config is None: + return None + + if isinstance(config, BaseRotationConfig): + return config + + if isinstance(config, dict): + alg = config.get("algorithm", "hadamard") + if alg == "hadamard": + return HadamardConfig.model_validate(config) + raise ValueError( + f"Unknown rotation algorithm: {alg!r}. " f"Registered algorithms: {sorted(BaseRotation._REGISTRY)}" + ) + + if isinstance(config, str): + # String shorthand → treat as Hadamard config. + return HadamardConfig.model_validate(normalize_hadamard_config(config)) + + raise TypeError( + f"Unsupported rotation config type: {type(config).__name__}. " + "Expected None, dict, str, or a BaseRotationConfig subclass." + ) + + +def apply_rotation( + model: torch.nn.Module, + config: Any, + need_calibration: bool = False, + **kwargs: Any, +) -> torch.nn.Module: + """Apply a rotation/transform algorithm to *model*. + + This is the single, algorithm-agnostic entry point. The correct + :class:`BaseRotation` subclass is selected automatically from *config*. + + Args: + model: Model to transform (modified in-place). + config: Rotation configuration. Accepts: + + * ``None`` – no-op, returns *model* unmodified. + * :class:`HadamardConfig` or compatible ``dict``/``str``. + * Any :class:`BaseRotationConfig` subclass. + + need_calibration: Forward to the rotation implementation; controls + whether transforms are fused eagerly or patched into + calibration wrappers. + **kwargs: Forwarded to :meth:`BaseRotation.apply_to_model`. + + Returns: + The transformed model. 
+ """ + if config is None: + return model + + normalised = normalize_rotation_config(config) + if normalised is None: + return model + + rotation = BaseRotation.from_config(normalised) + return rotation.apply_to_model(model, need_calibration=need_calibration, **kwargs) diff --git a/auto_round/algorithms/rotation/base.py b/auto_round/algorithms/rotation/base.py new file mode 100644 index 000000000..658d932f4 --- /dev/null +++ b/auto_round/algorithms/rotation/base.py @@ -0,0 +1,168 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Base classes and utilities for weight/activation rotation algorithms. + +All rotation algorithms (Hadamard, SpinQuant, QuaRot, …) must subclass +``BaseRotation`` and declare a corresponding ``BaseRotationConfig``. + +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + +import torch + +# --------------------------------------------------------------------------- +# Config base +# --------------------------------------------------------------------------- + + +@dataclass +class BaseRotationConfig: + """Minimal base for all rotation algorithm configs. + + Every concrete config subclass should be a ``dataclass`` so it is + trivially serialisable / comparable. + """ + + #: Human-readable algorithm name, must be unique across all subclasses. + algorithm: str = "base" + + +# --------------------------------------------------------------------------- +# Algorithm base +# --------------------------------------------------------------------------- + + +class BaseRotation(ABC): + """Unified interface for all weight/activation rotation transforms. + + Concrete subclasses implement :meth:`apply_to_model` for their specific + mathematical transform (Hadamard rotation, random rotation, …). + + Example + ------- + >>> from auto_round.algorithms.rotation import apply_rotation + >>> model = apply_rotation(model, config={"algorithm": "hadamard", ...}) + """ + + # Registry populated by subclasses via ``BaseRotation.register``. + _REGISTRY: dict[str, type["BaseRotation"]] = {} + + def __init__(self, config: BaseRotationConfig) -> None: + self.config = config + + # ------------------------------------------------------------------ + # Abstract interface + # ------------------------------------------------------------------ + + @abstractmethod + def apply_to_model( + self, + model: torch.nn.Module, + need_calibration: bool = False, + **kwargs: Any, + ) -> torch.nn.Module: + """Apply this rotation to *model* and return the (possibly mutated) model. + + Args: + model: The model to transform. + need_calibration: When ``True``, monkey-patch training-time wrappers + (``WrapperLinear``, ``WrapperWALayer``) so the transform is + re-applied each forward pass during calibration. When + ``False``, fuse the transform eagerly into the weight tensor. + **kwargs: Algorithm-specific extra arguments. + + Returns: + The transformed model. 
+ """ + + # ------------------------------------------------------------------ + # Factory + # ------------------------------------------------------------------ + + @classmethod + def register(cls, algorithm_name: str): + """Class decorator to register a ``BaseRotation`` subclass. + + Usage:: + + @BaseRotation.register("hadamard") + class HadamardRotation(BaseRotation): + ... + """ + + def _decorator(subclass: type[BaseRotation]) -> type[BaseRotation]: + cls._REGISTRY[algorithm_name] = subclass + return subclass + + return _decorator + + @classmethod + def from_config(cls, config: BaseRotationConfig) -> "BaseRotation": + """Instantiate the correct ``BaseRotation`` subclass for *config*. + + The algorithm is looked up by ``config.algorithm`` in the registry. + Sub-packages are imported lazily on first access so that optional + dependencies (e.g. ``pydantic``) are not required unless actually used. + """ + # Lazy-load all sub-packages to populate the registry. + _ensure_registry_populated() + + name = getattr(config, "algorithm", None) + if name not in cls._REGISTRY: + raise ValueError(f"No rotation algorithm registered under {name!r}. " f"Available: {sorted(cls._REGISTRY)}") + return cls._REGISTRY[name](config) + + +# --------------------------------------------------------------------------- +# Scheme compatibility check +# --------------------------------------------------------------------------- + +#: Quantization schemes that support (and require) rotation transforms. +ROTATION_SUPPORTED_SCHEMES: list[str] = ["MXFP4"] + + +def check_supported_schemes(scheme: str) -> None: + """Raise ``ValueError`` if *scheme* does not support rotation transforms.""" + if scheme not in ROTATION_SUPPORTED_SCHEMES: + raise ValueError( + f"Rotation transforms are not supported for scheme {scheme!r}. " + f"Currently supported schemes: {ROTATION_SUPPORTED_SCHEMES}" + ) + + +# --------------------------------------------------------------------------- +# Lazy registry population +# --------------------------------------------------------------------------- + +_registry_populated = False + + +def _ensure_registry_populated() -> None: + """Import all known sub-packages so their ``@BaseRotation.register`` calls run.""" + global _registry_populated + if _registry_populated: + return + # Import each sub-package here. Add new entries as more algorithms land. + import importlib + + for sub in ("hadamard",): + try: + importlib.import_module(f"auto_round.algorithms.rotation.{sub}") + except ImportError: + pass + _registry_populated = True diff --git a/auto_round/algorithms/rotation/hadamard/__init__.py b/auto_round/algorithms/rotation/hadamard/__init__.py new file mode 100644 index 000000000..02b61f979 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
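For illustration only (not part of the patch), a hypothetical algorithm can hook into the registry and factory shown above; `IdentityRotation` and `IdentityRotationConfig` are invented names:

    from dataclasses import dataclass

    import torch

    from auto_round.algorithms.rotation.base import BaseRotation, BaseRotationConfig


    @dataclass
    class IdentityRotationConfig(BaseRotationConfig):
        algorithm: str = "identity"


    @BaseRotation.register("identity")
    class IdentityRotation(BaseRotation):
        def apply_to_model(self, model: torch.nn.Module, need_calibration: bool = False, **kwargs):
            # No-op rotation: the model is returned unchanged.
            return model


    # from_config() looks the algorithm up by config.algorithm in the registry.
    rotation = BaseRotation.from_config(IdentityRotationConfig())
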
+"""Hadamard rotation sub-package for ``algorithms/rotation``.""" + +from auto_round.algorithms.rotation.hadamard.apply import HadamardRotation, apply_hadamard_transform +from auto_round.algorithms.rotation.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.rotation.hadamard.transforms import ( + HADAMARDS, + HadamardTransform, + RandomHadamardTransform, + build_hadamard_transform, +) + +__all__ = [ + # Algorithm class + "HadamardRotation", + # Config + "HadamardConfig", + "normalize_hadamard_config", + # Transform modules + "HadamardTransform", + "RandomHadamardTransform", + "HADAMARDS", + "build_hadamard_transform", + # One-shot convenience + "apply_hadamard_transform", +] diff --git a/auto_round/algorithms/rotation/hadamard/apply.py b/auto_round/algorithms/rotation/hadamard/apply.py new file mode 100644 index 000000000..219e684fa --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/apply.py @@ -0,0 +1,286 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Hadamard rotation – concrete ``BaseRotation`` implementation. + +Public entry points +------------------- +* :class:`HadamardRotation` – the stateful algorithm object. +* :func:`apply_hadamard_transform` – convenience one-shot function. +""" +from __future__ import annotations + +from typing import Any + +import torch +import tqdm + +from auto_round.algorithms.rotation.base import BaseRotation +from auto_round.algorithms.rotation.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.rotation.hadamard.transforms import build_hadamard_transform +from auto_round.experimental.qmodules.mx import MXQuantLinearBase # optional dep, guarded below + +__all__ = ["HadamardRotation", "apply_hadamard_transform"] + + +# Detect optional Triton path once at import time. +def _triton_available() -> bool: + try: + import triton # noqa: F401 # pylint: disable=E0401 + + if not torch.cuda.is_available(): + return False + from auto_round.algorithms.rotation.hadamard.utils.triton.mxfp4 import ( # noqa: F401 + mxfp4_forward_kernel_wrapper, + ) + + return True + except Exception: + return False + + +@BaseRotation.register("hadamard") +class HadamardRotation(BaseRotation): + """Hadamard rotation algorithm. + + Registered under ``"hadamard"`` in the + :class:`~auto_round.algorithms.rotation.base.BaseRotation` registry. 
+ + Typical usage (via the top-level helper):: + + from auto_round.algorithms.rotation import apply_rotation + model = apply_rotation(model, config={"hadamard_type": "random_hadamard"}) + + Or directly:: + + from auto_round.algorithms.rotation.hadamard import apply_hadamard_transform + model = apply_hadamard_transform(model, config=HadamardConfig(), need_calibration=True) + """ + + def __init__(self, config: HadamardConfig) -> None: + super().__init__(config) + + @classmethod + def from_config(cls, config: dict | HadamardConfig) -> "HadamardRotation": + """Build a :class:`HadamardRotation` from a raw dict or :class:`HadamardConfig`.""" + if isinstance(config, dict): + config = HadamardConfig.model_validate(config) + return cls(config) + + def apply_to_model( + self, + model: torch.nn.Module, + need_calibration: bool = False, + location: str = "weight", + use_tqdm: bool = True, + desc: str | None = None, + **kwargs: Any, + ) -> torch.nn.Module: + """Apply the Hadamard rotation to *model*. + + Args: + model: Target model; modified in-place. + need_calibration: When ``True``, calibration wrappers + (:class:`~auto_round.wrapper.WrapperLinear`, + :class:`~auto_round.wrapper.WrapperWALayer`) are + monkey-patched so the transform is re-applied each + forward pass during AutoRound tuning. + location: ``"weight"`` (eager, fused into weights) or + ``"input"`` (activation-side, via forward hook). + use_tqdm: Show a progress bar while iterating modules. + desc: Custom progress-bar description. + **kwargs: Reserved for future use. + + Returns: + The mutated *model* with ``model.hadamard_config`` set to the + normalised :class:`HadamardConfig`. + """ + cfg = self.config + + # Collect target modules. + try: + target_types = (torch.nn.Linear, MXQuantLinearBase) + except Exception: + target_types = (torch.nn.Linear,) + + modules = [(name, module) for name, module in model.named_modules() if isinstance(module, target_types)] + + _desc = desc or f"Applying {cfg.hadamard_type} transforms" + for name, module in tqdm.tqdm(modules, desc=_desc, disable=not use_tqdm): + if "lm_head" in name: + continue + _apply_to_module(model, module, cfg, need_calibration, location) + + # Store config on model for serialisation / downstream inspection. 
+ setattr(model, "hadamard_config", cfg) + return model + + +# --------------------------------------------------------------------------- +# Module-level application helper +# --------------------------------------------------------------------------- + + +def _apply_to_module( + model: torch.nn.Module, + module: torch.nn.Module, + config: HadamardConfig, + need_calibration: bool, + location: str, +) -> None: + """Apply the configured Hadamard transform to a single *module*.""" + from auto_round.algorithms.rotation.hadamard.patch import ( + patch_quantlinear, + patch_wrapperlinear_to_apply_transform, + patch_wrapperwalayer_forward_to_apply_transform, + ) + + if location == "input": + _apply_input_transform(module, config) + + elif location == "weight": + _apply_weight_transform(module, config, need_calibration) + + else: + raise NotImplementedError(f"Unsupported transform location: {location!r}") + + +def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig) -> None: + """Register a forward pre-hook that applies the Hadamard to the input activation.""" + from auto_round.algorithms.rotation.hadamard.utils.matrix import multihead_matmul + + inp_transform = build_hadamard_transform( + **config.model_dump(), + location="input", + inverse=True, + device="cpu", + precision=module.dtype if hasattr(module, "dtype") else None, + ) + + if config.hadamard_type != "random_hadamard": + hadamard_weight = inp_transform.weight + else: + hadamard_weight = None + + if _triton_available(): + from auto_round.algorithms.rotation.hadamard.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper + + def _input_hook(self, args): + x = args[0] + orig_shape = x.shape + x_flat = x.contiguous().flatten(end_dim=-2) + w = hadamard_weight if hadamard_weight is not None else self.hadamard_matrix.T + qdq_input, _ = mxfp4_forward_kernel_wrapper(x_flat, w) + return qdq_input.reshape(orig_shape) + + module.pre_dequantized_input = True + module.register_forward_pre_hook(_input_hook, prepend=True) + else: + + def _input_hook(self, args): + x = args[0] + ori_shape = x.shape + if hadamard_weight is not None: + x = x.view(-1, hadamard_weight.shape[0]) + return multihead_matmul(x, hadamard_weight.to(x.device)).view(ori_shape) + else: + x = x.view(-1, self.hadamard_matrix.shape[0]) + return multihead_matmul(x, self.hadamard_matrix.T).view(ori_shape) + + module.pre_dequantized_input = False + module.register_forward_pre_hook(_input_hook, prepend=True) + + +def _apply_weight_transform( + module: torch.nn.Module, + config: HadamardConfig, + need_calibration: bool, +) -> None: + """Fuse or patch the Hadamard rotation into the weight of *module*.""" + from auto_round.algorithms.rotation.hadamard.patch import ( + patch_quantlinear, + patch_wrapperlinear_to_apply_transform, + patch_wrapperwalayer_forward_to_apply_transform, + ) + + assert hasattr(module, "weight"), "Weight transform requires module to have a 'weight' attribute" + + w_transform = build_hadamard_transform( + **config.model_dump(), + location="weight", + device=module.weight.device, + precision=module.weight.dtype, + ) + + # For random Hadamard, save the matrix as a submodule for serialisation. 
+ if config.hadamard_type == "random_hadamard": + module.register_module(config.hadamard_type, w_transform) + patch_quantlinear(config.hadamard_type) + + if need_calibration: + inp_transform = build_hadamard_transform( + **config.model_dump(), + location="input", + inverse=True, + device=module.weight.device, + precision=module.weight.dtype, + ) + patch_wrapperlinear_to_apply_transform(w_transform, inp_transform) + patch_wrapperwalayer_forward_to_apply_transform(inp_transform) + else: + # Eagerly fuse the transform into the weight tensor. + with torch.no_grad(): + module.weight.copy_(w_transform(module.weight).to(module.weight.device)) + + +# --------------------------------------------------------------------------- +# Convenience one-shot function +# --------------------------------------------------------------------------- + + +def apply_hadamard_transform( + model: torch.nn.Module, + config: str | dict | HadamardConfig | None, + need_calibration: bool = False, + location: str = "weight", + use_tqdm: bool = True, + desc: str | None = None, +) -> torch.nn.Module: + """Apply a Hadamard rotation to *model*. + + This is the main public entry point when you only want Hadamard (rather + than the polymorphic :func:`~auto_round.algorithms.rotation.apply_rotation`). + + Args: + model: Target model. + config: One of: :class:`HadamardConfig`, ``dict``, ``str`` + shorthand, or ``None`` (no-op). + need_calibration: See :meth:`HadamardRotation.apply_to_model`. + location: ``"weight"`` or ``"input"``. + use_tqdm: Show progress bar. + desc: Custom progress-bar label. + + Returns: + The transformed model. + """ + normalised = normalize_hadamard_config(config) + if not normalised: + return model + rotation = HadamardRotation.from_config(normalised) + return rotation.apply_to_model( + model, + need_calibration=need_calibration, + location=location, + use_tqdm=use_tqdm, + desc=desc, + ) diff --git a/auto_round/algorithms/rotation/hadamard/config.py b/auto_round/algorithms/rotation/hadamard/config.py new file mode 100644 index 000000000..7ee370207 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/config.py @@ -0,0 +1,112 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Hadamard rotation algorithm configuration.""" +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field, field_validator + +from auto_round.algorithms.rotation.base import BaseRotationConfig + +__all__ = ["HadamardConfig", "normalize_hadamard_config"] + +# Supported Hadamard transform types (also used by HadamardTransform registry). +HADAMARD_TYPES: frozenset[str] = frozenset({"hadamard", "random_hadamard"}) + + +class HadamardConfig(BaseModel, BaseRotationConfig): + """Configuration for Hadamard rotation transforms. + + This config is designed to be embedded inside a model's ``config.json`` + for serialisation, and is also used at runtime to drive + :class:`~auto_round.algorithms.rotation.hadamard.apply.HadamardRotation`. 
+ + Attributes: + algorithm: Fixed to ``"hadamard"`` – identifies this config in the + :class:`~auto_round.algorithms.rotation.base.BaseRotation` registry. + block_size: Block size for the block-diagonal Hadamard matrix. + hadamard_type: Which transform to use (``"hadamard"`` or + ``"random_hadamard"``). + random_seed: For ``"random_hadamard"`` – seed the generator for + reproducibility. Excluded from serialisation (``exclude=True``) + because it is a calibration-time detail. + """ + + # Override BaseRotationConfig.algorithm with a literal default. + algorithm: str = Field(default="hadamard", frozen=True) + block_size: int = Field(default=32) + hadamard_type: str = Field(default="hadamard") + random_seed: bool = Field(default=False, exclude=True) + + model_config = {"arbitrary_types_allowed": True} + + @field_validator("hadamard_type") + @classmethod + def _validate_hadamard_type(cls, v: str) -> str: + if v not in HADAMARD_TYPES: + raise ValueError(f"Unsupported hadamard_type: {v!r}. " f"Supported values: {sorted(HADAMARD_TYPES)}") + return v + + +def normalize_hadamard_config( + config: str | dict | HadamardConfig | None, +) -> dict[str, Any]: + """Normalise various input forms to a canonical ``dict`` for :class:`HadamardConfig`. + + Args: + config: One of: + + * ``None`` → returns ``{}`` + * ``dict`` → validated via :class:`HadamardConfig` + * :class:`HadamardConfig` → converted to ``dict`` + * ``str`` shorthand → treated as ``hadamard_type`` + (``"default"`` → default :class:`HadamardConfig`) + + Returns: + A validated ``dict`` that can be passed to ``HadamardConfig(**result)``. + + Raises: + ValueError: If the config is invalid. + TypeError: If the config type is not recognised. + """ + if config is None: + return {} + + if isinstance(config, HadamardConfig): + return config.model_dump() + + if isinstance(config, dict): + try: + return HadamardConfig.model_validate(config).model_dump() + except Exception as exc: + raise ValueError(f"Invalid HadamardConfig dict: {exc}") from exc + + if isinstance(config, str): + key = config.strip() + if not key: + return {} + if key == "default": + return HadamardConfig().model_dump() + if key not in HADAMARD_TYPES: + raise ValueError( + f"Unrecognised hadamard config string: {key!r}. " + f"Expected one of {sorted(HADAMARD_TYPES)} or 'default'." + ) + try: + return HadamardConfig.model_validate({"hadamard_type": key}).model_dump() + except Exception as exc: + raise ValueError(f"Failed to build HadamardConfig from {key!r}: {exc}") from exc + + raise TypeError("hadamard_config must be None, dict, HadamardConfig, or str " f"(got {type(config).__name__})") diff --git a/auto_round/algorithms/rotation/hadamard/patch.py b/auto_round/algorithms/rotation/hadamard/patch.py new file mode 100644 index 000000000..4c5006f81 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/patch.py @@ -0,0 +1,228 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
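A short sketch (not part of the patch) of how `normalize_hadamard_config()` above handles the accepted input forms, assuming pydantic is available:

    from auto_round.algorithms.rotation.hadamard import HadamardConfig, normalize_hadamard_config

    assert normalize_hadamard_config(None) == {}
    assert normalize_hadamard_config("default") == HadamardConfig().model_dump()

    # A bare string is treated as the hadamard_type shorthand.
    assert normalize_hadamard_config("random_hadamard")["hadamard_type"] == "random_hadamard"

    # Dicts are validated through HadamardConfig; an invalid hadamard_type raises ValueError.
    assert normalize_hadamard_config({"block_size": 64})["block_size"] == 64
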
+"""Monkey-patching helpers to inject Hadamard transforms into calibration wrappers. + +During AutoRound calibration (``need_calibration=True``) the weight is re- +quantised at every forward pass. These patches insert the Hadamard rotation +into :class:`~auto_round.wrapper.WrapperLinear` and +:class:`~auto_round.wrapper.WrapperWALayer` so the transform is applied +transparently inside the tuning loop. + +Each patch is idempotent: calling it twice has no effect. +""" +from __future__ import annotations + +import torch +import transformers + +from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear, pack_fp4_to_uint8 +from auto_round.wrapper import WrapperLinear, WrapperWALayer + +__all__ = [ + "patch_wrapperlinear_to_apply_transform", + "patch_wrapperwalayer_forward_to_apply_transform", + "patch_quantlinear", +] + + +def patch_wrapperlinear_to_apply_transform( + w_transform: torch.nn.Module, + inp_transform: torch.nn.Module, +) -> None: + """Inject *w_transform* and *inp_transform* into :class:`WrapperLinear`. + + After this call, every ``WrapperLinear`` instance will: + + * Apply *w_transform* to the weight before quantisation (``_qdq_weight``). + * Apply *inp_transform* to the activation before quantisation (``_qdq_act``). + + The patch is written at the **class** level and is therefore global – it + affects all future instances as well. A guard flag ``_hadamard_patched`` + prevents double-patching. + """ + if getattr(WrapperLinear, "_hadamard_patched", False): + return + + _orig_qdq_weight = WrapperLinear._qdq_weight + + def _qdq_weight_patched(self, value, min_scale, max_scale): + if self.orig_layer.bits >= 16: + # Keep original behaviour for ≥16-bit quantisation. + return _orig_qdq_weight(self, value, min_scale, max_scale) + + min_scale.data.clamp_(0, 1.0) + max_scale.data.clamp_(0, 1.0) + + weight = self.orig_layer.weight + if weight.device.type == "meta": + weight = self.orig_layer.get_weight().to(self.device) + + is_conv1d = type(self.orig_layer) is transformers.pytorch_utils.Conv1D + if is_conv1d: + weight = weight.t() + weight = weight.to(self.device) + + weight_t = w_transform(weight) + + quant_kwargs = {} + if hasattr(self.orig_layer, "super_bits"): + quant_kwargs["super_bits"] = self.orig_layer.super_bits + quant_kwargs["super_group_size"] = self.orig_layer.super_group_size + + weight_q, scale, zp = self.weight_quant_func( + weight_t, + bits=self.orig_layer.bits, + group_size=self.orig_layer.group_size, + v=value, + min_scale=min_scale, + max_scale=max_scale, + scale_dtype=self.orig_layer.scale_dtype, + tensor_min=self.weight_min, + tensor_max=self.weight_max, + data_type=self.data_type, + q_scale_thresh=self.q_scale_thresh, + imatrix=self.orig_layer.imatrix.to(self.device) if hasattr(self.orig_layer, "imatrix") else None, + global_scale=getattr(self, "weight_global_scale", None), + **quant_kwargs, + ) + weight_q = weight_q.to(dtype=weight.dtype) + if is_conv1d: + weight_q = weight_q.t() + return weight_q, scale, zp + + def _qdq_act_patched(self, x, act_max_scale, act_max=None): + x = inp_transform(x) + act_max_scale.data.clamp_(0, 1.0) + x, scale, zp = self.act_quant_func( + x, + bits=self.orig_layer.act_bits, + group_size=self.orig_layer.act_group_size, + scale_dtype=self.orig_layer.scale_dtype, + q_scale_thresh=self.q_scale_thresh, + data_type=self.act_data_type, + max_scale=act_max_scale, + tensor_max=act_max, + global_scale=getattr(self, "input_global_scale", None), + ) + return x, scale, zp + + WrapperLinear._qdq_weight = _qdq_weight_patched + 
WrapperLinear._qdq_act = _qdq_act_patched + WrapperLinear._hadamard_patched = True + + +def patch_wrapperwalayer_forward_to_apply_transform( + inp_transform: torch.nn.Module, +) -> None: + """Inject *inp_transform* into :class:`WrapperWALayer`.forward. + + After this call every ``WrapperWALayer`` will rotate its input activation + before the activation quantisation step. Idempotent via + ``_hadamard_forward_patched`` guard. + """ + if getattr(WrapperWALayer, "_hadamard_forward_patched", False): + return + + _orig_forward = WrapperWALayer.forward + + def _forward_patched(self, x): + act_max = self.orig_layer.act_max if hasattr(self.orig_layer, "act_max") else None + x = inp_transform(x) + x, _, _ = self.orig_layer.act_quant_func( + x, + bits=self.orig_layer.act_bits, + group_size=self.orig_layer.act_group_size, + scale_dtype=self.orig_layer.scale_dtype, + q_scale_thresh=self.orig_layer.q_scale_thresh, + data_type=self.orig_layer.act_data_type, + tensor_max=act_max, + ) + return self.orig_layer.forward(x) + + WrapperWALayer.forward = _forward_patched + WrapperWALayer._hadamard_forward_patched = True + + +def patch_quantlinear(hadamard_type: str) -> None: + """Patch :class:`QuantLinear` so random Hadamard matrices are saved when packing. + + Only needed for ``random_hadamard`` where the rotation matrix must be + serialised alongside the quantised weights for correct inference. + Idempotent via ``_pack_patched`` guard. + """ + if getattr(QuantLinear, "_pack_patched", False): + return + + from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal + from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad + from auto_round.utils import get_packing_device + + E8M0_EXPONENT_BIAS = 127 + E8M0_EXPONENT_NAN_VAL = 255 + + def _pack_patched( + self, + linear, + scales, + zeros=None, + g_idx=None, + global_scale=None, + input_global_scale=None, + device=None, + ): + device = get_packing_device(device) + if getattr(linear, "bias", None) is not None: + self.bias = linear.bias.detach().to(torch.float16) + + W = linear.weight.data.detach().to(device) + if type(linear) is torch.nn.Conv2d: + W = W.flatten(1) + if type(linear) is transformers.pytorch_utils.Conv1D: + W = W.t() + + tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(W, self.group_size) + scales = scales.to(device) + if self.is_nv: + assert global_scale is not None and global_scale.numel() == 1 + global_scale = global_scale.reshape([1]).to(device) + scaled_tensor = tensor.to(global_scale.dtype) * get_reciprocal( + scales.reshape(tensor.shape[0], -1) * get_reciprocal(global_scale) + ) + scaled_tensor.clamp_(-6.0, 6.0) + scaled_tensor = cast_to_fp4(scaled_tensor) + else: + scaled_tensor = tensor / (2 ** scales.reshape(tensor.shape[0], -1)) + scaled_tensor = revert_tensor_by_pad(scaled_tensor, orig_shape=orig_shape, pad_len=pad_len) + if self.is_mx: + final_scale = (scales + E8M0_EXPONENT_BIAS).clamp(0, E8M0_EXPONENT_NAN_VAL).to(torch.uint8) + else: + final_scale = scales.to(torch.float8_e4m3fn) + + self.weight_scale = final_scale + if self.bits == 8: + self.weight = scaled_tensor.to(torch.float8_e4m3fn) + else: + self.weight_packed = pack_fp4_to_uint8(scaled_tensor) + + if global_scale is not None: + self.weight_global_scale = global_scale.to(torch.float32).to(device) + if input_global_scale is not None: + self.input_global_scale = input_global_scale.to(torch.float32).to(device) + + # Save the random Hadamard matrix from the submodule. 
+ if hasattr(linear, hadamard_type): + self.register_module(hadamard_type, getattr(linear, hadamard_type)) + + QuantLinear.pack = _pack_patched + QuantLinear._pack_patched = True diff --git a/auto_round/algorithms/rotation/hadamard/transforms.py b/auto_round/algorithms/rotation/hadamard/transforms.py new file mode 100644 index 000000000..0f41b4cd8 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/transforms.py @@ -0,0 +1,171 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Concrete ``torch.nn.Module`` implementations of Hadamard transforms. + +:class:`HadamardTransform` – block-diagonal Hadamard (deterministic). +:class:`RandomHadamardTransform` – randomly signed Hadamard. +:func:`build_hadamard_transform` – factory that selects the right class. +""" +from __future__ import annotations + +import inspect +import math +from typing import Any, Callable, Dict + +import torch +import torch.nn as nn + +from auto_round.algorithms.rotation.hadamard.utils.math import ( + deterministic_hadamard_matrix, + random_hadamard_matrix, +) +from auto_round.algorithms.rotation.hadamard.utils.matrix import apply_transform_weight + +__all__ = [ + "HadamardTransform", + "RandomHadamardTransform", + "HADAMARDS", + "build_hadamard_transform", +] + + +def _filter_kwargs(fn: Callable, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """Return only the keyword arguments accepted by *fn*.""" + accepted = inspect.signature(fn).parameters.keys() + return {k: v for k, v in kwargs.items() if k in accepted} + + +class HadamardTransform(nn.Module): + """Block-diagonal deterministic Hadamard rotation. + + The rotation matrix ``W`` (stored as a frozen ``nn.Parameter``) is + constructed once from :func:`deterministic_hadamard_matrix` and + normalised by ``1 / sqrt(block_size)``. + + Args: + block_size: Size of each Hadamard block (must be a power of 2). + device: Device to place the weight on. + precision: Dtype for the weight tensor. + location: ``"weight"`` (default) or ``"input"`` – controls the + orientation of the multiplication in :meth:`forward`. + module_type: ``type(module)`` passed to + :func:`~utils.matrix.apply_transform_weight`. + inverse: If ``True``, use transposed orientation (for activation + transforms that are the inverse of the weight transform). 
+ """ + + def __init__( + self, + block_size: int = 32, + device: torch.device | None = None, + precision: torch.dtype | None = None, + location: str = "weight", + module_type: type[nn.Module] = nn.Linear, + inverse: bool = False, + ) -> None: + super().__init__() + self.size = block_size + self.scale = 1.0 / math.sqrt(self.size) + self.location = location + self.module_type = module_type + self.inverse = inverse + self.weight = self._build_weight(self.size, device, precision) + + def _build_weight( + self, + size: int, + device: torch.device | None, + precision: torch.dtype | None, + ) -> nn.Parameter: + data = deterministic_hadamard_matrix(size, precision, device) * self.scale + return nn.Parameter(data, requires_grad=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + ori_shape = x.shape + x = x.view(-1, self.size) + out = apply_transform_weight( + self.weight.to(x.device), + x.to(dtype=self.weight.dtype), + self.location, + self.module_type, + ) + return out.to(x.dtype).view(ori_shape) + + +class RandomHadamardTransform(HadamardTransform): + """Randomly signed Hadamard rotation. + + Extends :class:`HadamardTransform` with a seeded random diagonal so the + same seed always produces the same rotation matrix. + + Args: + seed: Integer seed for the internal ``torch.Generator``. + generator: Pre-built ``torch.Generator`` (overrides *seed* if given). + *args, **kwargs: Forwarded to :class:`HadamardTransform`. + """ + + def __init__( + self, + *args: Any, + seed: int | None = None, + generator: torch.Generator | None = None, + **kwargs: Any, + ) -> None: + if generator is not None: + self.generator = generator + else: + self.generator = torch.Generator() + if seed is not None: + self.generator.manual_seed(seed) + super().__init__(*args, **kwargs) + + def _build_weight( + self, + size: int, + device: torch.device | None, + precision: torch.dtype | None, + ) -> nn.Parameter: + data = random_hadamard_matrix(size, precision, device, self.generator) * self.scale + if self.inverse: + data = data.T + return nn.Parameter(data, requires_grad=False) + + +# --------------------------------------------------------------------------- +# Registry and factory +# --------------------------------------------------------------------------- + +#: Maps ``hadamard_type`` strings to their transform classes. +HADAMARDS: dict[str, type[HadamardTransform]] = { + "hadamard": HadamardTransform, + "random_hadamard": RandomHadamardTransform, +} + + +def build_hadamard_transform(hadamard_type: str, **kwargs: Any) -> HadamardTransform: + """Instantiate the correct :class:`HadamardTransform` subclass. + + Args: + hadamard_type: Key into :data:`HADAMARDS` (``"hadamard"`` or + ``"random_hadamard"``). + **kwargs: Forwarded to the transform constructor after filtering + out unsupported keys. + + Returns: + A new :class:`HadamardTransform` instance. + """ + if hadamard_type not in HADAMARDS: + raise ValueError(f"Unknown hadamard_type: {hadamard_type!r}. 
" f"Available: {sorted(HADAMARDS)}") + cls = HADAMARDS[hadamard_type] + return cls(**_filter_kwargs(cls.__init__, kwargs)) diff --git a/auto_round/algorithms/rotation/hadamard/utils/__init__.py b/auto_round/algorithms/rotation/hadamard/utils/__init__.py new file mode 100644 index 000000000..0b3bf2eae --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/utils/__init__.py @@ -0,0 +1,2 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 diff --git a/auto_round/algorithms/rotation/hadamard/utils/hadamards.safetensors b/auto_round/algorithms/rotation/hadamard/utils/hadamards.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9624e008623e86678a2da7f27000106e03055257 GIT binary patch literal 1436901 zcmeGAZH^?#(xnGZZ(6s&1G^n5xRhIY&g*EIjl7_y7EVc>Q1h_>ceb|MQ>!{onr6zy15a{m1|K zkN@HS`p36#zy0HX_{V?wxBv9dn*H;C{kQ-6k1zlD_WSo?`~CYr|J(nG?dz|@RJi zzm2%x*08UO#IN5Ml3$nHUq|jABkqsypnap{MN7?w@%!NTK8hPyqtE z5%i-3EE!3#y$e|#neN8d<+RppDE+oT*X*DFK91aS8ot$O==|jv7G%{4&G#XrW^Mbn zw4-MK{9pc4h5Z;}ek_U0L-MsAk|zBzz<&G~l*?AtPK_#-ZTVW;;!b;@mh<+t4qV4B zEJcFJ!@6|qTeWKxyllj}592IW0n66a&gF^zW1NjY#?kn?$gkNy|NVde-zx9hdaQD; z=-HB5vr+o8Gs|ma>FnCr<=B6#V?TudK7{zJ{Vzy0ICS1G&$|I@$!pa1KBuEhV5llb5M%fGSse|`P(^?&;E4_}o3%m4g8 z|F{4A@Bgigzm$TY?RRzXoZZCuUKrnh#xVQK?>~l) zJ2Fbe6%nQu!1orxTkV#JZ?z%cu55_eU;g-Y6v7Fy?Hiy$I^h#Gmd9FR_~_wJ*P3*_Wh$sXu6c{1Pqv z@w$cbTQ&B#D~(P1mwJo+I#BMk_P1*7Z&zBI^ec`x5YsqX%MrMpT0^6mFue(jyU15y~*Aj}2g zdpCtY8n=XeuMPTsWrLFb<=Y?MzYUZ-8}z+4==+roO8S>yfBE(6uX!c$#=ohhHt746 z4NCf#Ukl`Wwwb>!w^0754f^BC1||K=uit+e+swDAn<&-XKdy8)>0f^Rv%Y6W3x8B^ zq5P=s{oAfWg*IxC(OP#d^<@N^Gzb6Xi$_6L>%lF@Z`H`)u zKh8IsTE0TLvcXCJ^8L@>e#zE+yYn}_d|&GK=fB5EufK(V)_an_9`&91wwABo3+2iN zC;dzP8T;N^?(Gm;bZw84{^iH7OGS77FxNj?3*}08lg?k}Z+WfwMfS##EnlHr>2A`$ z{P}CWD0AfQ{B5p(eSE3kpSzp%FMs~|%P-&4-8+Aq>z^Eja;3XT=X=8Me|(RaJEyMN za-}Ps49NFi^Xm7Xr?>i2f8opjb48tBU+#YS_0NHGPYrUe=}A6c5`XPpj{yY4V-F^z|Plg>z-EI={YT{r=l8 z{dC_ctE#$k_&QmTdTa4m-Z_1B_^zEjKzQZV`1_AM+jkCMoxN*Euag1!<7>Wg-`?!s zzFgbrWCkGz=FT_EI(Ao29U#BezoUNr{l^~-az{rAxvqrd0{Q;S_Z+!9C$0|M)$^8Q zK)&^PyYscOPTSSPmSjN23-iudtD|=Hq$L@UAN_aa&NssPLU`@_U?&&IAK!kYeX{-+z3$_HD3}3*^Vw+G@`Hov(xSZSdNc!A=I``>#2! 
zcfJk&@%_uS?}D9NAoX1^-&bxw!9TuSeSXahLf+VSzE0M+$!lLGrT@V;hI81#WFJHTF|dol47`{HMu}w$kGD9a)Q+#-Jvki z98-iM+TW4a*~SVIyz3+;Clj(#)mg>y(Cm_$;7xTs%Lf$he- zNrs{V5-hh^Kb(Z^yGl~VcUF>4q!AaDct+ku;-P}yiD3PlJd?!TPPd%eAL?atqg^*D z8GD_y&Bjx*Xd<$x6+bsggd&DEiO1AL<*3I>8!JAe9S%T1O2ADK;Q2 zsmz&)iSC4c3<+!y{zQU3&ICkgLe)n#21VXjA@fBgomxXe*Gk?^B07SOlb|1=-oZ;3 zm}KREA4h6(;`*WzbF`2LoTh7rAMU=5EPa6T}EGTnUE5Us2b&O9+mVDDhn+3Y2!}Mj=vish(O5jBdTFdYuMTV#qHm zN$!H07?#aZIrSlGKZ)t)2lfC74Y*oK>b^pP@LX@LgoH^X-L~$ZJ?z^plHQ55?6O_A zD!E0HO}tV`C0uWp9lGAG8zdlIZ`Uo7is#dpi&_QbI!VP`CjsMnCGRF7;e8}fU#(=V zrnRAxA1GvzXuH4=KPdy(@p_5I2oummwW6ye#yyKf-fNX~weP4TZM)X4XOh@*_Um{h z)u!u}1pYlFg>$`KWq%%tO?v0X->4+*zS1u0zPcybB7EtL1I^HKwOzMJ%p$+3(ozl0 z{$>(ED*vZS*rB(NZ3y9k)cnuBzM1gH6oxe6?B+J7Q3h+z2z zSA}UfqaRyW!i)vDW)s9Grk?1+qE(=PkrZ=`9av#t51h-fy+GF&Cm8@x-Bphv+4Nzks4DDi_yMvJ;42TFk3 z!uzh46s`x+R%u|G)KY({1WCND?}dNh*uQm6--4e}Tqk z_5u#YRg!^AF#7o+ zL^6ClQ8Rov(dgMH(b9pM_^9P6%emrsLM6{N-p(l2nAy4)z$NRH&J6dO(eN;d6fzA71yoAQC&VOM0eemH#KpZZJW+8xf=HGJ5EaiuDuU+`M4W?%5K+tnh>GPQM0tK5P|>4^>P_&7 z))mWRi0T+Tg2<*lfG8AyBck4&kFKcjAJe)|qI!LdRxeLQOuYG>^(&?Z9#3|L+ap>h z$KQ#_=6!tY(z1u_(L*Yto`(=E2mE9AC{NnQSG0X(KeD3QpY`4FkwmpWk05e?9zc|8 zKAx!e=i`Zbe?FRM*`E(4qHmAcqsI{W9(_bbe>b9fRr4q4qgw~VBZxfH4%*VH`wuhbha3UxkL1Zm|C!*@wBU)E1k0Gi*aE~D3 zTt9+nJUaaZedxd*`4ly&1?Iil9_jTi#govF9yTz%W6p<8*{s-2aGE^f5$t*#M=dVL% z87YJUT7L|aD-(Zo?W4)q2`_j2Ln~X3PrCM@l|k`$tjw|=R9SKPls@8scqAUSXOF9l zL;k=vK8(z|{)UxxU5}~kk!1Ok^^nt8&3jm7Mf6GLwrM zZA6bh|IGam-d?lF)oJWsh%B|)iI1+#7Sa543c8PL<0qN@<>2lfs~X_%Jfg{G-S`h7Tfx>rs_?Fdju#Jddi(v+*FZoQ=nj zbq60pCf9@ZEa&iVNS1d7&+~`xSy4Q!jSxMIjOIOvtY{ubR#cB7v#v*xS=VF8x~@+$ zz6s@D*c>VkmWM0m>W`1mINmjSkwx&s+6d9#lFUXvrZRi@xHdklG9Kbbk#YVXMV5!} zF_q=HeoSTIdQ2PZ%x_*;AN-tslJRs@$MegirZn9<=6rCpk5ChIj)aHonL@)KHRv1) z+a7f&SELi7hS0ZGW{QcHYb(mu?GPSJSdNncgfQS5kkv*gCg-NCS@$nnvL_&_US$Vr-RZDAfjSV}h+iZ|EBlP(wD>j^7TCOwx72-FZk%vhcT+%4-?Kt8iD| zme6jAS%7TSrlq~2ZxHGoguNoOrmy7T9)c4LW)K0gZY{2G>fN2UCW7rMgr!1?Chb)S zM`VkAbKXMnb0XBY2(d@nd-HB53V4H1AvXx`=Fz@IX|^pev~*kHc72Z3(0DI$^52PWX;Idg6zc z$kc=OxXRwWTv{scuDlDvB7sw0Avc7uLd-A%a2x-QJYhN!IIzSM9EYoh9fcATmnrYb z+awm+N(i5;5Eg@xWz)VT9FYpWn}=*+Mj5vVvEL%Z{?@z&hOXU7+|=|_h409tHp8v2 z=Bv_dTXefsOC;Po^DYvL?Iz&`#S6ms zeFeE~1x^KNzT=~aJ%p67rFbhLWGf+A7pt2-$X3Fnl>spAw|oL#P>@Sf3%OXWzPq51 z-5^YLH$d-*xnIR?LYbD9W;Lp~=TrVtMOGExUD;ND=`&*4!(LSgExqR(0ZS0!O?b~& zn7F&bu_Dvtn?5rj<2U&X7prSNGFH%KH6U*K;FC$vWi>?J^x?P(ddEl3P0%|&lXeg) zat9%;Un|_MaNPE}QA>r}Rl2-Zf9bmmdWX<$y9K(Xpb=~HLEKzay16!h&nL%4~g`9N4vfZ42=ZkAi8EdyA= ztoW=e{a1YA-M|dlb!V!(pm-DEJxrMrAiP~%=&hg$9qN_}vl6I|S2$L~8H#`!BOM51b&#LAa@@t8DsLD!hluMywTXSCqf4aJ#~B+qXr5 zdu5<0o8{$-@@|9P@!iENj`gC_^(wZ_wMIGHcp_@`Q^k5+1eVHef#utlgwtfqg|KM_ zhFd;dn;u?Z0MvVp<1fTDOCdE0jnldKI)!LJl?ErdmO5lJF8?iW5BSX9q+gBdNu zse^#PB(GPJEign1W#pyr9wzeqs3uCvaD~uFfze&phVasN53`cCgfIyOp}fL|UNRFp z{WR6fYgMEyz0A$*(BrzaYHD0v_uIZgOKrt?;a*TVM16aDNY{09Qh0w+Por2-nEbnxNoYBW=J1=A~0=!cpOG(~E+kl2^b){^YxdN$`Mq zsRqW%<(|(H3F&R$Jxnqem=rZAE?4lbk!o1QT>36nmncG*96UoKKKdm=5Gv%0g?Y?Y zA16Q>i8e=P&FoJ*iN$~ol)lB{G89)x10zln$uFRgi94JxNWr~HiUgGaMkK4~15S>C z8z)#xtT~2Xk_v?#1o3mzB93d8`0E;^B(%AcaRRi&-XJxGTjvQJ_JsvCjsEeZka=Ar zO}*1ArMyRzTHYlrex98@;le25%~< z3T1)>nlqE8^t3^>+x99xeY&=%dn+w0w&6NyIyRlO5A=1?-lo~4#d)XFd!+c%o_8Ol z-K53z5u`ao)mkR>!&#OiuUGQ1Y<$ko+MYC;5E~(}t8RucQwaMp4@- zOfx_Q2?1?#9iZX7LmB{(lYT)eZWO`MAyU0UDvTnyks(0F2DnaYh@45Qb;UbZL1r;T z(TWo;315;9fHk-*xy%C4t0pyjE2-Mxs!JT@I~z-V197l4$ymONREp;NiL}@NdnhR) zZj!2{+ob4swD(S>w);$^t>t>9+5VJM4XIe>y-zV<(_+IC;FRcgRzRY(U{4NKW*LUD z{hb_dlR}9-X}vOCW6f4)a2KbE2AqUc8B{aM*;bm;)?!?b0CH|tT8_FB@KdGq4bZEk z$t)i~0t_TAdr3u`PdWr%t&|G=)Huo^ka>!sG?(wH^d4#9egr9sf$q^F%3^u(+^RI? 
znflI3ZPP7MJ9w+ov^bm-NhzKV_{pN?UbY9pJ4i)M69EN93Z}gRn2#=IUIu>vDMS!U z`5;oOyG4qe1YBva-B5DEGEMW$0+l6~V_OI2^-4RKTr(ZR1K5X>)?S;ChK_(YwwH}c3mtkdD|x|z@)~K< zrjr)c?e?~72PvpGE5&t_bhsvys%JhaTXnNixAg-my+>MdZnqa->-o98deWgnVE5cX zGc3|ux6lEmG75T|G(h-`4S!OlVX84mVbQNSgfLwwYT5>O6$VJ4QrcWnSSaHCq^MU| zvV+4MV@n;lh7xWpemqx6t@Bo;*gt?&VYk|AEw@OmH$^31eQ12O(tD(|My?M>bMWI@lk!pFp(F=EtPgEK5KT6X+R{M+3*q}Y>xoxC72mM`21-;e8U+a(}WOB)WjsV!It1c1H+7NF zL`fs+&7=s6xGSVlshpXl65b+(XjLSw9kHTb?x?iVZ&w=FMbSG*OBy?2@0dx$h}Gq$ z7$l^b&P=^dT5);R~X)-1^8TH~Zf4~x;k*t=b$>k*}ezV(`HAw@-N#Ev>D(bCKY*r|3n%Mlp}qHHjboJU@C!8-NjXm28%NR_eG^H zF{M%xgaGR&(l!wxG_lOh0Lj(6N#PVCxWQR*qvI;6oVQ3DG8zd|UQM)o4C$!jDk+`X zSE=Ho-^O>{gXLbOA46&rflf2iS$n*A_Eg$9*DDR#V@bQhhmxkJH}+H|+}_>Zf~~xq zoJ<2jv+><-FL{wOl@v7}K#Injm3p#nktT0XrB%mmQUPyON)K+fw?P|#KsTaN+=ve$ z9iUf96)?Y2%URlxmMoPPC-vGU=5-OOtrxCL;Fef|3|(+#*>>=Ddy~3{RMv=P3TMWa z%yLnLZi8^<8vd~yX{?Ry3Tw4gn&b`cn@OAXI_ZFo1o$?jQC=ZsZ4W3}pp0vECeTQu z42-{~T;SxE1uK7qY%BssT*zQC!uVUHi862{5osW1@i0{BlrQg2Eoz!_v8LmCYQk05_Hq-g9Cdti<59<^pkU<0HO6k|vN;Jq?1q4@l* z$OY0F6~{AhkCBQ$pH%&~Dos&)NYjqUElD0c}e=a(A5>xW- z+(XHq#Mv?qj7O}79LI4G*%vm4Ht%iqq1;2s2SgvJxFqi;_DR^=>LbJl_E_4T6_4Y! z=Wrd$J(N7&uk3R*ZY)_bzxz1tCzi@)zrS0u?%d-;xrdT%=Dv#A3V$m7-R(M*dnkFg zXsg^;aXPd!_fYZ?(VxV0fL*KKOf$dw_90-`!6a{IvT3_>_Y+g+UePvbck(B(Cy_mW z{b2rUwb~=O?_}mV+MBzdxEXg5SJH0cqq%#D;XRbQGg;ofxrdS|_%B9Wa`&qCwY0P1 zBHc}VFn2dGjoK^PChtr>lzS*So4fCP>?h9Ox_zxK*4@Mhb9WOXc&})W)b8ZJAaRbx zu8RA8wJ-Nj@)6NbB1ZDg!)4;$+(XF+MAs9zx8lm&pL;0z05PdMRhzjxcRz7GxVuI3 z(CV1V6qAc!4R#YFO#)bHDV+JUsRB zCs8cS03tx3=}c?`V&vlw^CjMZ#$m2AvCEt^auOEEk+%Sqvuyr|-&3+OHQ?ikhlV}G zlwjVDinDQ}^lpWCb)$BMG&rF$r8_!z5{s!`0p3O2Y7z0+iVqMY)ZfN<&*7p__F-qM zpF*6C*;ldkKBZzBy_0wx<7}+km1*OZpBV!O!NHwzUw0D=h3XxxHjSHV2MEjPi$?E+ zj0y}lOVvAwM~XeDm=dh1@|{yAmARxCy>oP5W~BdSkAq~hbpGPoy6|I-iotdw)dT_CUbAAsdaC~?CxG-o4k|Q zT6Yq|ySHL^_f{CTCz?Gk)?i zsU_{qGNrM0MjrE+6GxWOZhi7j9!@haGv z1!P89-6keuxvAI&uK+ed8!;niU;x1nF0SBcKzf@xXe12~qX`4TTQFP<6Nvm6Bak#K z`OymozkqOM(3v~Mi)wzN%i7jI% zF-S{`Ea2_LNZDJlz1v9)=M#vlRlA98)6Q1grJWU%w!hT}Dn3B$k=)tpr0uIXS^J1B zcXz7~RD6Iqto!!3XgOJrulN8l_40DtQ?ay9saVnbT20p8ihUR=+aH@o+sbP9nAL-V zvpWit>oIvev67&#NzWs%?qyK9_smdBV1j!_75r!ggaj&M<~=j~1SUQcfs9I|_W>Y` zP-Qk5h=hQ<>pGaOdISz3=t&ik9i7$~Dnpqjkjc*MVC+su*>3VKU>`XR+D~rh4y1pQ z?>KPFe(yOl*{r>LUT18a^d00Qd*6ZceD6MROZG1EJeyA;={N$={l z)aKI?Ost84rB`}70<&S|B*l>Sjd*geWDDKtK)f?us!#HF_w8S@?R1Ir_T~fU z*}TZ+QF!Zdg_Ua``IYQhRmYc5!h;5ZKz$r`H;U01V- z0@Z}x&}-Sotpp@A@f*?sye9o^8uOCH)3DDErA3@rLvdA&2)Z+OLwE_?B!aC{wo4b|C*H$*ULp zE2ooh&91l?ZKvUHPA9T{*Ij#8_D|$MeUj5pTS_ z_Fu=kUcBzN?;1t)8>eAuDf^PHj)2kVcJ4eepzSAD*Mal)Jo2s@0W^>nFvUC?dZ+E} zGFPNBn4CSbqu?tsEhahE^w`SRZtWnq&ZXt=u6(mq?kiJuC_b z<-Ozg z_mZ>UsE-D*-u0Xxs{9anF&`ke$fs5AdvtHxYr~FIPT%(Kxeeb<9^JdimHiy@?%7Uq z>L&F$l^-JatnEHPQ74u&5PTfnM4cklJ$7LtE59I#JkC2nL~Pa;?J zZgQWm-IaUFc2}N8*xIRyhj~Swq!lAk{v2 z9Xcx?r$5z4Sj+o3f1kvs>^+C=nR}lD=Bay-(5Ok#`)={TMHtY}bqH zu%GfbpUqVB*2jFhAIQz`<%;SqEtjha{uh83Vm4=D$e8Z8LPV^4JLZgBZ21_sbDc;1 zy{Fr~9rLmG?D?>5=TrO_)luuHG~;FUAK)*-wYuyw_hH-4u?_ne5Bqx)Up&?Ye|deF zpHl17Ll#~AviL}UTIvb6x!vS$b+2;jKg8dl3(;neTbh;+^Vc?yy=BK!Y4(21Kh(e3 z+i?!YzH=sqZ9AXp?~2)mwfbzz*0!A!w~fteXMQD*y#;nuyq$ljKU;nCKKB0Bcr&pb zxAKO4?B%VIj@z! znR_u*!^J*vWwowj{n!O{*SG8=Z;ZF%ZrMj3 zRd27cm$$}k?6vu=ykY+-XKZSZFP3c6#phyi>2h_&bdhqgAULgGMmJ(vz4W5~e#Cw6 zmHZb8cc_hgh5qM)UI_jv-->`HFxQx08?V{O8usmc*tX+#ZfxyAwjtwo&R)G-U$bpv zw)1PTKiRB$Rkc>K+R~Y6_L^;*vz;4DS#_}& z+bh=9KGtI#u`k!R>?3ZBOC05Q^*z-PYt&*N>oJblm+M>h5jVyq4hpO1zW$Ecu#NQ? 
zU)sldjJM)`vUkFUTFVeK>|;H~m-ew9t`E^$^vO+#76RJq#6dW<9f<@%O=#Eo%@tI2lH zMun+TSrw3o;oHW)j2kg8?PER0C600rT;nY49odf~pF_!Xo`xLwJ(hi#o(c|0VC%=T zxM$pxv*l#(*~7i#kz6>BRiEY`!8lfZ_WlS4d%-oH->lxg-kUdg+=D&?V=u>9c*~lY zx5VAz$qwce^B}! z{yjXh_vU)?vm0Yi-m|BB&diqQJ$rp^kKb}euH_BhvFBqPdpO2nALFp!8b{n1U&aml zGG^ETYhiBNwdW^$9#$g}hVo%FUPWE<;mn+m>TE=SF$9jyzKE`1m<6`g03;2?E zX}MfcMJ&TU)?*y@F%J6}7kh2UvKKwy?h*5Q{&Xf8V}39GC0;7YZjAYMkA{tiG2i-! zed`O__4pCD7gALC-LiOJ;vW~koI@A=;wKX8{k ztvt3|bRQt19w&$`V?_0l`d$sCS z_3h5AG1og6|2?1BlvCGuzO!1_x9l(DZmnZmngCS#UytcLxq>sRBfIJ-fWT+80{T;mHnwr$+4^({NO z_)>9pFy;tI2guH-z;ia0(h|AT_{#WB`gQ6AQyivO?PX(zac1=VL3jw(a~H)=OKeUR7UK^K%8S#lQ5wT))JpS}yi+d&Roi z$9jz0Smwh%#^Haterc!jQT^DTQT5h7;>Ng*-NwK4zg&;JVW+~eXPL$F(mvK>+{UK* zOUqcj8sCA67{N8RteRvt)?*y@F%J6}U)sldj3bWy+3pGRmwUEdzZU<}|8o5jKb2-z z#+-4?nQYfD?XUS?jko$VIh}~&F;6$LNn@VXsO&QSR^B!@Y~6`+Z1*J9#@hLu#744GA!>*Tf>Kkkkr#t)4>KayX zSu(5$tO3uh{kjt;b_Wp$oFK-?Bf*$0bQp}SaI-3bb@BqPBSvF$!|KOG7okw3n7Ks@ zR_nJCXE>z8Wh_vO^{*#_rrA5H@spCR30I1=*hCm;4<>aYO?$EkxAnQ5u-nhyj;D7b zeR%T0{1m~1KhDA!eIKx>&%@pm$awm8oI!*e^4w1N=itfP;n92YnW$aJ^YP5>ID-hz z7AetpCWiHKRe+$$TRWOGcjIK z|MDW12lIKI=oeLCtqJiilX5J4B7+zFq}XN$<6tfzK@jd=EUcp5%MWQ5*5}POuT5B1R9M+X=Sf+1sIjXN#y4`Rq>Ef+ve84Nva` z8lEj8e=MGTCZ6Ak>cjIzl!RxC;3Pa-M8Bw?{B%6O6DoM}c5oh_-3c^2Tg3XFeB#sb zDZ*pmzcQbFChGgkv$uoi^4TJMS3g_CI2BJk6V-?3Z^x$yPhxGw(>syF@Z2-e&*rCy zoQmh33D4wn4`!dqr-~S7@~I;FiSWt^vqjj4=ZYwHHsh%xLjGJ4{eF0^h~9>0ieN8}UQ;lt3G{(|S^nAAz&Lb{muXgY zC%_Cksr5L98lEqNx-8oIxKaJPIuZbqj$j-Y;@%uACKVh7@w61wb2|e6(XLv=VUP(z z=;gs`+>gT;$AwVG(PBKOhe7^ELO7!b+gh740pmDG&iS!o3gfVlDHu~gW)$-&gcHHV z$;#veXGAH-_BZ7m#Fzpyp_tyBBh`!zK7=s^WJWQcLg)a0+xn?p9^ga!`?)nQ+j^`c z>&yS=D5Dr6GIVIigCIK~u^ z2|}pkU^SkPqZm^_W)$-&Bpt{ryym`;kDQSy7*jxI6mt?GRm#z70630u9EA7!Q6av? z4uZ_WI9d%lreGWdQO6XFV<6FS5Mv4m8*_APjti+{eppDq)Q?p&LrB$fv>FhOV;l!5 z4TptL$5D`37)OP;CkKnEb9%fZQ!u80%qZp@Lh5yVR7f3-2^hygs*b}#DC9WE42+Wq zscsythVyY4V+zOwA-qfvR#OPaF{Xh0jfBwAgKahaFb01b4qo*FI({8*NIkRsc-pHX zB-vson6vUMQ?a4OanniCGIz1YCCET>@fz)fu)%Ih(6Pl2tnsAEB~i^?IfxJ;QIWX_ zn>f~>HPQgq7QTj5&_N+hrsBOqtZQi6+mYi!M4)MQ4pFww(%!l?Bh~I(mpFmUCcb5B z%zU;G`465E`T<1UojRwp5AVnXA!`Xow&t*qw3CL~T5q(}GfJc1a9As3tzoSTYulZO z20=&!4g+o|%MmCh8o0IH31*`bX_~fLz_lsX9=(g6E% ztZ-P{?!^2B3De>Q4HQB!GSR@T?cpl3Q5}XcX(YZ^2&trz388GF#32bcd>ba#n%Wa5 zXo*UOkQie(NRCh$w5SgfH1XotWVRh+D1ha{p)M(Wp)=UtB zD21VQr;rpeO4`>EMI01j1=$SCVn--TMG#bds!Xkh<8w#sLsT5^2vnioEhJ(U9J#2B z{2KI9ONTLajw^O!u*E>G&czKJwoYxkK|fK2S}=T_)v;^lil%D_G2x&P>q07mcM3rV zH6Q5+I_%m3A?i3NL>du@kUY1?I^vcd>XI&3lZV@LaL6#Tf?jL6djo$gnk_D$awu6IU}5mqe9#jZ%3Y* z2|}uvvs`sfIjSit4bn9ByuCTkBhHPWhe>s3E>=sY8@`=MFbPgLmfV`x)pWy*&EH{H zstaetHB7KaA={Op!Z-sD3F96ije?wbAt7wlh{xYy*Xv@5`w%gT9i%8^{vqru7GaGYeDlyeR-p4-pPxEY?8UMS?9BQAk2Ax&Lmo?__KEE@Z z-Lr^6$fp?3NLtbV6zNm*xlQzA+nHKq`TVEJqk9rDqeZ89nm)yNW~&X141HAk=w|u! 
z13Xkc{mj^;b39FBqv*-Aw`uIpDLQi|G3)d6^lhrPoToFNVt8olaq-ofZsZMop3daG zg2>hy#i~X*j(n`7g|9}^{H>dHdUug^5P%4BdU3hJiA5`OCq1x;kU&*b;;`$&pb@*j z!>$Au&WOs-t761!f(~YQh8R>(6}P8%=1gMf$4NSq&G{5FHpIt`&2f*0^|=SQ_vjpA zPShFSi6=S0oSUE*rzsEJ`usdq%-ATNcjiwNv!0sg zi(z+O^y0HKo{zenq%+y0lZZ)MP9cW(3wv}5F^D*cm~`YEVoJ+tI)j$8h{<2Yvz(dp zbjGuLl5Ohu?m5oPnZ)>A@f>G{^KzCmLr2acCcATv&Zy-ione1YvrT6b$$DW=ZI=FFU^Gkt{5@Sdrz z4B8f70`w-b|E|`M4tMq;9M}}|DdnP4Npie>BRUeFNPsf~2_dDt6-YaIpmT}JWi=qREaTZSh#k4;M1eKgAg;h(Y08PW0 z1~jX%lL@L_pHUf`J_TnQ(5Ikm$k>&Po9{7mI^A!L!Ls)~G+r9Vapnl3cXgry1NG-pDG zgqVRd4QNtfY0lxw3Sf$$dK*pwnus$+kZ<`39pVr2r=Zb;>O{U?(313pZ!Kr)Q0@5) zK|B;waHauG669e%UK#s8qeG=+3eYs1X+V<-Tb{FHPZ|3)qto?JOahvOLs1hd17!}* z9H3}9UKlU2DV0sbnFchguyYB@TlMgk6~L6Or#Ul#CgMyHRC?y%Oaq!Eh_~2;$~=Hm zaOMc&{LCmUXL&-0{;q;*&u6rn_sqU}$a&oV zf~~{j#Xy(1gXSbRL-Zvel#8!*m_(kxtb*bsx8N?wjl1}wL#tvc9N#`cfVHwig1}H% zWb76s#son^Oc2x%fulwmy<^L|q?A|py;~MN$G0qcj&E7(IV`C492Vs9LsU1shMikB zsY8V+E2L37I%Jjr#{?mz5e{t`QX1fpAjQlOl)fJC&?G_W&hZYF z9J+I&4owo2$7#kHnj(njTrmf?jJ=v6h{tP&AWqH{L7<#S5H-yZlxORBhw?f+-l5Vn zWy^RG&gc+grU)uAQv|W=GXxdL6hWZO5Cq7H1f{Cug1W539qKk87L=niW6LIWXp*3A z%;7Usa;iCV4)v4{O%gOj!Z2x{i`TKQTP-&Rbp;A(fo^ggI2}*B|3!(e7HlNMHaqqnnj#1(GXxdJln!waX9(iKIFTUsYK9;&PSqj0JV6lMnb9G|Oc11) z34%(`ln#}iDIG%33_<9bAxJ$F1gU3+pa31ebDU>h+h@!d9dNts=)d6h*@8Z!HQ&CYaE;!)-YMS!<8}-wDCZKD^D*J+qou&~ z?Lep2*Guq@K>e*n(U|nx{d2zSpaU|eQ#@z`>X@i4Qeiggp7JX7e4?t|b41bY73oN) zz}Y|@*uED?2SgRpBvCJs_KV^?olX>jrik)9T_El4l#lGEsOms>B3<=$RGU>R92s$4 zG=1082jWjiySA_7T!Gpn>N?UMQMTyk`gVxAfwV(ZHty|q(?^BU9}x9<%|ZxqGeeTu3s z@FKhW@*Hj>X_-l;bh?|@^ZQWo-b4D!ZeN5?I8***dBf>^y;FeH^?K7^efr*v^r|T3 zoJ|zFHl+XGNOMHJ59yvL&iLs>jW<(- zj}-@pv+ug5eU3a)J-3qbcVx$s;IVrZDPhT%Y()y~+o6=zQeHp}8K0u;z{htAns+w+ zx+qS@dy%rsWNuGqj^qqcyO6Gm@|eE6zH6P@hjdL8UBf=1Q~QwqDxyl~oKEGW?&$eX zonqHcC#nw898ny}ok%Y_RXy4bb@}uiLAn&>zU{5=LewFo3sLFz$wc{Lnz4PHrvnFJ z(BVP={S@J zeG1a2sCwtrl{b%jvcAuC!`Kqaw(?$Of=0Rb|-+3G*`9 z2fT5+#!TF0M2rFG@I;tu2Ik-ZffqBga0@tWiP$5Gocax}95U<>g&;1V6uMhf^vnBU@(7%+Q&rlW?W3|e+vnplMO6MsO*vDuMA7Xzo$@@*+P>3? 
za?_@CD(`Xdj-IK!$7hK01eBl&qVmkm5Y_#eAga_5QUO@%W2Rjx+ZmsM{e5o%@RdKnfieoUTQ`1D1qDi9Y_MA>}0_KRa z+fzh&EYBsX>zW|SGd^qkW_4b+%7Z!OO!=@(5fzXbqJl9)l>O#yFyTyzF+-G( z%M?*QaZ}C|#TD9wP60FNpwAMO+GdFA;wE&eYnvd7{Xbo&rir3K9?ip@;&h*{Q`1D{ zOwM@v`thA`ruvm~CQ&&|GtLy}DTne*oth?!^EB&BO%lc4%@O5^oU(oH{S;AoJ53Sg z@jjO*o|HM;$8%Tb;9O^Fnked;)2TeYGdh*m+>GrjO_MrRnkID$O>;z{X^yBm0jIqB zG%;`-GCEz}Hk@88xz&*NI-xm2*zcuPYtuR{+fk9^W}&Ci#2EZb6cK@XerFDhW47^$fm}>&+3Wn&xZU!x64G5AFUM@a*)>$)R zNSb$o@EDGx!&WS34VF}OnV94th{3I)2nf2MhT@EM2^?vqhSz(q*d8yboS*@+HI9=Ru=4P;^k3I=u7b<38 zEwiNQep|{HnIL)Sa>>g(dOW1k(P1AIfO!MEUe-}E039Ql-C4J3n=F5c?o zW}yeP*vxX8aMt{Ee#VBXMH0al2NZ(wBuiNHaq$V(GAqMyC+&r>#=~mhz;$e}FyaqX z`AFl2B{wW#ND!+rODyYkmNY;*&E#4L*vYs$aG*H~3|hOL$_S?jj4L0oxbT^iSz__5 zv!r#P(VZnAb`ZG4cO-(fzSYc3P7y+ZMe_h6z@UbZIk9-wS<-YvR=>h}6ATzH@$ri5 z=oB=~1PmWhq+`j=JYb#RU1y19z0Q&bi0It1NJ!mvU?t6w2-f;mGqZB&(`yP*gc__j zK@B0<{rnb}B027WBLVIdnJ23BL1Sk~)|#S&GM6y?C2bv?k)y&K^aLCk`dn|=xh5;-L zH!}0i(lgL1#xw?Cyb-7ao3xC$9Gxy^0)~$$aoVK)4P!b;L-$9QCD#MqxP8_xzE;$-Xhh%cdMIsD2$<5r$kST_809m^>@-x+fo z5%<|K|KfH(eEhGyF3Nw_tvsbLkJbr0#(6*ER&xG6g>jUZHt@*h1YLV7^_{O{5xmsE zgo-$F{hsm5Ni2FX*OjnQu%mfq$K61+fF;Pe#{!CpEQK_5yXrin!kAFw6HWL7g~5g# zbY#XN{jo3HQoYex9!ZV3{GrqZbaE^ z!rTELpiC1irc<+7dXbh?K)8ZGfJ6&~fpsfaKFo=)JF_I}5;7+t33@TqC5abfhO=)~BWRaV&2GN9BYv&m=@CYQ$JLv!$9-FiSwKEw2%;=PrY37Ttuo-A7A!Zru z3LFqjhBwd&G&->cBnZ5X0{pD`*_gq>fKRVJri5bgGRKgB;L1nO2qmZSOiVUjg7pTk z!^*77(W%)iy-4Gi%-Fog@+s`S9dkQqxJ|dUJxhP>dpDPs*JbzVG||1#BS>t9k(($p zSz^f8K=|?M%bkUp<_3baF&l0W63C?Sf*%p(H6TH;88)ySg=yWj?kcc2e3pO1L<_`!~BA}TMz(n_q1*ZB4n0qXQX%-{X1dHj^Y#{># zlyITItz$re9CSeqmA%oJCnk8n2$*|P1;tR;LaR&khDsuS;$f1FdE4S3T(5ZvGu+6Q zldOhc1C16o*g%6;m$Z(!SP!W14GrzpZxV3?8?JZd8?)hF1BD-M=<1hyjUWv)Y^)p; z)xOo7XcLnh1mn0pQCN#dzcRSB+@}g_871mm!lLVZTiLo!C#-fa7u$HIt^5?0-54() z-(Pp%W9RG`r|N`ftR$UM7_B;^Ft_l8!g97gh4J?GyFnl9>Im=kGzJ%If3^-^{q2o2 z7E{NY2jrj!1fG2KI8Tgex`>233y}EL0$Qd)6+Pb7!3@kGgaV7^0Y!j8hV?A5l0crA zfkxo_gf)d~1RYS|AV^8MH=hh2E=+dGtqED-e6BEiV$)G`o{lMHvs+QdeQY38FZ#&b zhB{5m2p&=d_KRgnZxCyEJR z1>P4I)}X7qGde8@pf+0U-iMaOGz{RxXr4=eo-3|voFuM0Vp)Zp^13bQ16F$frp}S# z@7>OA;S4vax#dfboAecNxZfZywY@6tqSnLWBz;|6kt&St-RWG(+0=SeTu5IP2kBYG z*{j#Z^#faV)WumNg)x$|G&h zwGqSwU*9`Qhbab_&CRQ0fioF<2ykN?w*dpVM*PhA8E_rmaHKYhGr~;JK~P;60XPC4 zyBUe9Y1;_ttc<~on<(8E;@X0zNtY-r!Yp?tM*S7VrLMYfN?mMg=cw3DjT%WvoW;8o zV~I>pl9g;p2zLn~pt+bWE~mm9VaC{ppT!YFc6|~q&IgMNp?7tcyrylBslf@PkmOsofD3p2b3+uJc_9wd zUtAnDS=Ie*O%#C%r*dc3pY2L75|FlB1bddUt=QktxxrCM{;ZUN0Yz7o%n>K)>*9=1 zz+G{pggKoH>8s+>+gHUcCEORsR*lv=O-*cp`8 zH&F|B^9>4Ug;@f~Z18>J5F|a2HRGqX;$XTiFS(_TiK$n@gCMRqyule!wFb6BI%}8) z8d;qWyi0&O(P?m(s4%U3gJQZkDHhtB;?TH6MON->8I)F6_dvTDofZU=5(QN^zzMg- z#5_=##L_^L_QWQ{KI#cN8KxPkLnfjZFhY}v(HY~29{IR1GjP?+DCeQU+aVbZ7CsbY z_A&sXU=$#uck-LiS&T7lqhXAK!JK7;w5yTxuvGIX*fyO$)P7>NCamri%jfMaO91q!pDQIm;q6;C6 z5(QoO&w}La{PQ9|NZ|WRaZxB~hkk-8qiAg_+GvIY`5#F?- zGM7UVXhPKts2p*vz-&^m6V@3H!YL7FLKD#OTFWxB0?;hMivZw63rgVRAR7+?Zi+J% z|B7Z(rw(z+Zybz`nk<6J;-inTQbb*Bl3;I_(ZnoqWPm`nL*H4)MfC76G1R1igHc3~ zOoE`Z^o<<++8ZT_&7=q5fDPDiXTnpd>N zMg~{12nbG$u96jiP;^2ZO&S~dfDM;Cs8JCsv@J1&W+8A~L<>^)FC0MfphoXDpn$cWJBO1Vf%hl7j zh!Y^Ud*T!ckL&`6HkKiHFwPN2QBZP|$BDfeCATE`BLz1p<}fhF#^8FmMih~WgQi0g znjCHI3S8aG$lY54g41OB@cWxPXdxn0A2irPl(<44vrO+`tZ)jtirbxU~@B2U={vMHfNL335bd0<7nQGeF2$t_bjDgO+$l>NWTj5K<^)?i?ZLJDz9y~$ z8^u{DViKcCZNGEK2!&{`I4TW-xDBrCa3}!uC^203V(NT{&L^5R!&$^3t<~HUXD110 z{TWlxXcaEvG6uK7@#wT)jnmx#BuP3$VS5snRx#%X6tG-2sOtn9qR$zmBSrqPEU#ID zImMGpY?#QzWRku?iv%Fb<>ww3=$-V$!cS5I}W0kmfrPV&twd z&XT}g2IvA}(g*$Fag`$(H)z# zU?(7Im<#qn;znJtM0FSNqI0;*V>ptzMu!C@AQsl}2nKvEM@ z_qdr5mRRqo;aA%raaks=c903U(dNV-GD 
zItP%(!=8C^!Vu5^#Iiy(T@9g13Q_lP4?jNsi8C#C|L6IQ4kYoc%5SnGR; zxMGS9CKhu<+`#CX3Mxfdlhf2ng_3o#(XiOMd}8N`TMC5JS|1>;c=(Z6>Fm%*3iK4LrweC7xI|H->4KYnO=7CYOL_rr2v<-hUH3ZK*l43I@7={8CUm#+v zyAE!d1uP)wG_CI$2B&a;M{yRN%IUyb^=0Scz`Jybx$Sv>=QHi)FQ~g!&1Q z^FkbGrZO6m;13noTp>IxZmDM!m`bVw>J3*GHIGFPe#tL8_cn3l+OLc^J2zEaHmXaC zYBf+*U05iw?#A>wAy(R9hIB7RXk5v}51dFYfYqW0lX%e(Tg^?#O&v8&qPeUZRL8QY zNw60R*-SR2m#{MRm!w{VgFA-(;?@G3$pHU8aZwV*-mi-jU6#ICN3k+`4O!8H125%! zXU@fdHS>fFm%+p&u7gHh$uy4$2pGh7m2?gI;~~LWA_rr}cGS>e0kU2(Q#DZo-v4fnNk+U9lc8RBR|Dj0S3CbH!zHs#M0m&4S& zw3(kFBScYE^OK4zU2hRbQ&UilB=n7Hb)-HQdPhvBk${Qdbi#n1;x&M^4x-3#6>F!) zI08GXSy*+tc;ORgi$|0Mz{``Yrj#gpxHo8x%*Y^l0|7>K>TUl3CYB^)OkD#;T1c`2 zm~it5Wx3E|FfPWj>o!6YeryARvD2dhkIs$~dV)B3-XKoazqYv1-#MLgr(W+|NMGO9 z6N}^EadG}Q0gPpFqA@0o9^wRcj=|&zLDtmHg>{lR!Kg-!sfD}Z9hun#%dwIrNk_cI zR@h8&0XwI-uJko=w1n1rF=Ii}uFjE^2D>S)w#W-rqTm_Z;vRCV3!68>v|vP&(M*_f z)7WcMhZ(0y-m^%W_?=G*C@x8GooK-mM+!Lc_JX9&aJKLT4VxF?iL1dRM&Rn=E^b^! zK&ZBnV{k%skJQUt^SB8I0C8T-MiOo!%I1|dVXJu!c7|_+0hvHjY8tp2D#AEsTnxh& zC#%IlfESiT8^7Gdhznq_Fl~ZO!>7y12;3q!kZ~szr~KE&S%qz=7Ua5UhJDGc3+}ih zol9>YERM3cUfb5G;tGuB!EF;2kh!)#ID|Z0oZQpI(Y807xl#LQj8(Hpdur^65ip~3 z?)vNE2KCaIL*lBYH*M=warUp7?{)4O;^@ZG+S}r6+v}a_9ULStEx#%d$$JI6#^~r5G7`^ny zZO$6d8671=l9rTSHf;?}3J@#(V}Xe|SI1=)2j1aeY}%OD)MOeOs0M}tz#esKmm?&+`YwB%r1tT#E%*wK9^~QBxaS_SgS{#|J z*gy<;Fd98tLN-mD!kaIZH1nmntPpxvoVmnMh~gcYS&RzZh>a7Tpj7BKxC0_b zf+asG2_|Pn5QDjVJdFr&0zUy0yXC|3MxBIWk^;6HDZ{FC&Ns@e=ULR$;67?_?dXq0cRI{ za)CS<6So+!xp;E{^KO|}@cQDKbf`ZC^7kpnD1WK!Ncvpn{!~cWz1L>4pM-Jp=I_Y# zEg7;=JnHb6?z0Fn_`yXLsPTUfeOvRfku%3&l1@qlyF5qmuPk{%& zU!Hr5IeiniIB$}9XE4&{37n2M9njv*E4bdJGq;#`{an1cfT)`(u)wAQegxi3f#{o# zHxW>M)A8m3M&Crd>40qT%q^ZP(D%zkf%UiTQ=m{IU#ZE04u(^Pf@#^87DX{i;GTvN3)XfxFU~{j0 ziI{kwTd=*$p{~2#jMj08Xb28JOCXV4WyQhF+#1Gs=_V9s?&SGNp7%jZRj5T?J#ofkc!r^BLTxp3! 
zK{DWA&A?=M+o)Gt0GB`_xy*HLhR|4r+`8M?06`N3i{S_WBv~tAps9DG+@#1!jLv+`-8LJtGr4Sn_H+rfxAuZ{ikD7RaM^ zVu3ly^8{M$+%2Bl!N~%7K4uE!(ct|z>mlLMm?_X_ccMT~-b8`uo7qA1&FrB1CJI#F zM1j#aPhj-T6UbraDnvnR=IL>Cda+*wncX3|re1Id&KD{W1m3MOr9iAu%U&;Qq(Zk=#=8sJeY?P zlTxy-sRE^hnu*8<1zOC^4kB%$K&4fowq0OrZwmGzym30>7OfhoCr zN{7H@I7QnSyA3=|2PX^c{!Q!PkayY+P8PWIZ&C;ItVZOFGsqDh zpBWDc55vHl@sRk84ZsNkOWa(69^{Dv+0>aGq-R6k3D01k;iclETTJt2c5s~GSprMn zRDnJp6So+BGXPjlI#^}Sx~`d9Ja~sZ2*l_9et}D>(vc4c zkls%oJEklr~GCRZbL0lV%FcX_zOl^i36rzL{H$ zzL^5mH&LMa&MPp_+Pn@{rRVP8WPzOJnOp2(p4dSjwuu5g$y2v@at9|1%)4sd7IXY~ z+9qx>e=p|=4A?w@DR!Q~KG4$y_5(LfAR7zVEP*_`V4YN837aYqu$dhMYo4u%&~KC03!W~shx2foOro}O&%D)xwpGCp1zYD$4`+qftN=PRF3^S|Kj`wI$3p{eY@F$vx`g*&)rGXoxPLp^QXxE zcFTVCPaiEFg!*;h#iTn+Y+mhj6=R37`~D=)cynPXTjmmS8(9Ok)qo)icjhM4(DOD2 z4O`%x@-tcbK1F&SkDkfTPUhu%@afKRJoT9zJAG=A zd<(1V%uhFe1kNt9I`}D)4)}08&=C$+-SZQ#jq1ppJNGt`iS{1fzzYWNjGA$`-A-`D z(5^TQO?3ImS+WMGMiKP^Bp9*zA##|@r`NjyF|hU;4pKT1(netFJTB*=i^2?$4q{{( zYp3a}JbB{6W8^RchQ&|38n}67#WW`DLWsZ=gVA-@={lzJlbN98#emT?ePW5K*$f#I z0x)qzgkl(kZ=g`&A3;vmtab&#Bp?aTEsn@&Yk3yha%XM=9utQdNm%^EtAU$WR#Iu= zCIo3qR=eVqy#YiqvPlA3cW?#(qw|=#lQ_)f;|fCfU2+SakxXO4E=260udX;_B|7(mWV=ve%OjJvVe%xNoV z4OEWIOeDw&fyp9Oz`FNLldBQL2vEnE5MLPeg3M1~K1F7ix@YvIyF-84iiHm1yVJ=8 z1+SycNy@+r2FNwz-kTA-1dA(9Llf<;9S|uA99-ZGewW<5vSJ!jbfDCP&N8|Jtl(cv zWI913Y(P!0c(+IPru&zkNfS>lQiUfMnY})_$n5p0MUK5bv&hnT`c85lPcIUICl@*XUbjvK*Xt6qSIK9J zn{{qgRT*538Eb=_;EDljCykk=^5a;tTw?H7Y<&J?6^A)Og37PMisX`;S5{18!d_V{ z{o%H_;&j~rq8MqGO91>8djQo!T(NN*M5@0X8#;`!s+rPg9txx8w6CDkcalDSinIgV>bGv(-~HhH1v+VU=ict#f>Vne=j7BPdF}Trx^>oY zuJ>dfv2%+YEj#nm&3QckcAsBl&f~c|* zDSO4XzH7U4Dw289?M^{TJoA}6eJ42^XBX*7KDkH_;mJEW-s$IlPoBS%J~RG-Id3QP z%$!?f@ARofj%VgmWOab=5qZaD=J$$xO^Cf;fHv*r_sP#r+M~4Lvy;3;X$XJU)0mo< zt$&>3@wX!hG^4E^$&MMP6Ig?s5EuhiLUkIz0;9Ymj1gO`1P=&C=emR;a+m>=42x*- z5Mx+$1kkil8D;$H3Iakz6{2qqj!#&ilPG>T^c74c;hTC|@%hiCB5*SR|BHe71e?kx-|wFqK6b5wt30oHA=<0zp_LSQ&CiR5RNM z*$SlYgs=vqRMis$rg3;HYY{?vsu4@r8L)&PCUSMQinw!aBw|H|1%4-7&9u$YK_E*S zGPkP8N}!f5WiE)6@=cf2-d529mPR6J5lCGTc#v7uNMy-bFe09G`k;lSH*F`iOjAh9 z2vZ@oXUgIy8DUBVk$O??G$CpviiA2x+|h{TtNKcz*fIcPh<=L%OK+=#hazSp@(QGm zmL-;OtF8_;xrb+(DoUu(^CGrICovtrMFN!3D&m!aE1Ff1L4YYy2Vy&sC1)uco>GK< zI~p61PdV6GTSk~JsXbE`KgkGFDyaO6Cw?Gm2oB!4mv~a<_yyx&cH@*;g|s}qApecY zopJ793RPD&Rvc*~qg$o20_?`pi>043VsJ^Lmd+%6Y#I|u6I}k`wHXa*!E5;vPs$v> z9qsJIDYJH(LNXU#D}a1dA=c)nlan>YxM7l`bO=$wy#WHp}v#8mArAy0p;ximRB*ar7K-7 z8EjxBut0U|%2rwlqK$2ks-~hjfi=v*YtqBEZu?e>qTWKK&ZEOSb06N1Ee0dFuAUn)} zkYi>VG|$x%8#PwJSs+2%i%4>4gcd|Kb4msnS`IHdVK|AUEqlg}Q(&6snrh>;W<{ur zO;dH!05}R+1sdaujNLBT1{+5?2qX^^E{a{lW6ZfaZ#vXi1!u7dw^x)xDG?x4_gUPn zq%^J2g^82lq>CAl6lWpn7dlsdtY!flIrRZtmyOW6*4Q>knGkhD#m` z=;!?%eUlUI`}x)1QT`o#hZ7x-?_rexa(IIi<(a>QQU6u{`X|a^zQc)*&$lqj@B3RA z<)OcU(fq!@fzep}@<#Xf{i_@G-_0@k<=@f2M*UwgJgNL^=T2tM@2^j>!Nxw?BBthl zc`Sqc%RhQ7zk$)5?bkOt4&UNL_htTjqWq5X)E<6!$?_O)|JL(op`5S%TmRQ6Pu7Q? 
zwU4@wkL;;O|M1%I8uc|m?MwZy(Dl=s`?5wo!*_T`{~G00@^#{+_mOcXeecsCrAIf| z1RE<*1fc4qGHV8~MnZPP=oxiEQn+ZErde$O?0^YlY#XHX6x-HvK}!e%P<6tgOFuJ} zV9;LFxw@#d%haKsi%2whdI1{Sm^ORF5()}0I*R~Qop5H&0M->NDfH`TFzR=Evul*v55Q>DgNDBn^G!$<#?ct1pIYn2rEKt%<2*2p9@6+MPfn zq^<&uZIBKiQxfV}9t|`PhU{?DE3gfk=lZXV*439c>QUv?zVwM=@jV{>dl=>Ye1{X= zSM$sN*1wBU{uaK$qaT-ljpknupVu6Rk8K~Gzq?b*KYbik{yY*SGw(41YZyRfiXo#N z4SUAUl;_ik0=U%J;X&(S(^Qo-KzBgVRu~G69T~F%mrX4oF=Qaip=Zn*31uQi&!`JX z;i74pX0@So227YC+aP6zVj*6k!$lT@sA|cP?5#xw>Su^Fl#wujn9@0mF&0 zZIG%J(QQURlE`8xGqxP27_-*T?0eDW>H<=@Xqu*3Z77`q6K2RZNY$$14vZuuk;Ray zmYg1fnMuR%Y+lr@Wyxsgg%a(o(HUSgAja4>Na?Mji7=3qMHWK_vK)HGtd)Sx4 zwgDM^J^)P&HdbOWZI?4y2Czmzkx1!<+>{ks7n`O~+W;6H9aNL1H1oK)J3M-QzJbyB{Q5@ugZ>te{#}gDBY%ez9g}Zil*4=n zqk4P?qy3Myul|nCQNG8c|7(=r&t9MKU-%>S4v*fypKo9^zeHc(C_dle(a+2I*JvKp z=ls_5r{fJyG=JdUz-ayie|e++k$;Cr|JSJhc;x>RI=}MNn)sVRn|H2D+N8N`r%?Q{S7MtM7P zeHfWxUI#(Bv z02g1HwWv%uco?AqHMRj$H4xjHU}J+KC?6(h6uZ_BBE9HxbpaVMDpK93(esO-1gI1t zyOWtKmrzjJvr2#g>49k$6;`_W&gM1M)NBB+*yKdrlB5MbDp1#rJDF3|OehFktN|cQ zrdIM4(PBF?skf{;|J>)V`fqEO5)zm1$11TYU zXs2jjTFB%N5Zz2vV8U}z=jyVW&I>7}hX&A&m~h>=lVz;3 zhUmw^21QU-D>)$KTmRx6B+b`6Qmm$nG`w_Clf##6)P;KAt(Ll9dhNjT1}MZ6J2!(3ATy} z^!e6C3rX8@@G=7yuMn8|xG9zisSPOul+B1BiDA*75FnrEF2~AN19uJO6snNctg1yD zUlmQ2c0LYaJOEIc);OB3G!~IE$oc$xvyN2smu7?dY_s0e-a_u)HTQQ4rY|w;Y+Ym+ z$DgzK;bbibdLG2@o9$!gx#u~0o1|Ft)Y+VP-)!#Tmf5)l$2cmt-g$_ur;Zj$`7}cR z9kV^~J7yWCh5s$H9&~Q$S10QMJiH7%io6{>0zP?StK+F|)x%>>Ks`wkWN)6V@DO&0 zJ9OvUqmP4eA(s` zZK|ganluIX;Hn)KAj=UtD@3o^`k@!K=}`6y>~yTFAwR8niVfAGjnwL$N>G;32P4X9 z4E3_8IQ1elvR0`sd*c7xEVgc%#pq4573Fr|NDH0Nn`A12d-DWWnli@bM>;G4;!-3> zZV^#fU}*LR0c-_MhdKl3(N&3BzHAifQ86sJmC5XY1kn^4lm`y^6b^zWxg~pLryS^k zJ0;81kWFhOwiPvdC$y&t>XU(Yz~+&P7c2>ict?QXVrKSLp|s_Qo{mDPK?28zZA0Z& zK|DRs*@Gvs+7zwUmZP{;iB$-w6DLszD}J4_a}+v*szftisWBnQ&|XK0tXOcv^Ju8nQM`JmHYe`ke!*C5f1!|eL;%a52 zz0L{BW(-Z}G-MmbAwtf#oU9^ZkA`%R^BDlFNSUJ{q8-NwTopOOR-m1t&Y&u(@SRy| z)84^m))-AT3u~-^oKI?35hPU!Qzi^Z-T?--0)dMLOmo)gN6V@qoT%C-1Y5GHc{RWS zG{6Qbk!#GZP>s0v8T>98}Eu6a_l5z`$0ZouYzgym~b% zv9u{^Z~Dk)ETka3l2TAuYC@6b&KGf!Lm*eO znjq(MYHBKO)sf847=f7(AUu>JkOXH^ElW4f8B5<*eY*D~bW3TeE9(0D(l5w9L#Im4&25;gsq(h^hLC2H8PQ$&|_t zvc#Qs5Lk>V7jDvKllcS2bDXMBao)WpA0FVxd zVojZd=RNc&IpV=X)Gt;Mh%m`<)XrQ)GL_N} zEH*$^VTTm4&`x)`Vo}0HnZ}c;oB6aTq(v_mEUQJM0qr1ZTUx53B!W~05)`yV)&^4j z>{rfwGvNHB;A(3MZKcP$Vvd$>@j?!uxNos2>T5P-L^0st+fXZW#G1z&%*c+KA7vZ1 zg>bHj>;P+wJ|ak&QgQlZw$EBmB@JxxZKxG4`Uon8z{ZsQux%Kaau(M>2Lza8TcbB+ zWh0KLj=EYc3LghAE}V*?e++6%c*N12ZoX_N(Xvt#RudO&Qe@H5ldL8Z5KRyyM4k^3 zN%oY_SUU(IU@sdvm3-My0Ab}@F{=UFD6(i}p^88#aBShoB_Qv zkK}`seIv6Ro_Fx;ljY#~37)esr?6+k^G>`K3KQqc_Q;~K`G_GLV3O*O`so>ilDDhF zief-MRMmtupP;NZ&9oh(Z09pU^@|m;%tuR@Nt<4-t!R*<-^?g$*cowQDFtkA^<(wmBgJ$CSf>J3p-nf-jzmzb&?cY2 zYb#K#P&GU((elb393a2Q6q(3OKS%=$*j0fgDDgyAG?RFzfAA<4BDkve@qj2@ExvZ2DtRK8VhdBMS?3&c^ZV1Xa_%yD4i7& zf=W@{_|Zxr4CzEiJtB~`m$_gR*Pc5!AHtmHu=$jC^NDd0c>+)5#YQI z87>vSvz7!EDBFdl6mSCDAxR8AQt)g=Xo^B8Uh!l{o-sCeA~>(hK*(2P$(iAZ6}P11 z8CeuQFbBCq%_p;h0`c}XF2%IH)2h>mrkb0X6)_qLy6$jPtc#cg@G_Qk0LKs=c(RVh zVTzk9d+4Y_3EDA_2-}@Mm)(lG=D`0l9AFp3qEPcembL5ZXx+7*n$)szA(VDZITDf%cbEW=qPD|p3B zu*JlQG9IWo(+U9CEccm55tIW7qEIX7mtx0Whh@2k6aDXr(TG54PXAXrH%=8RcLn4-3L z4l)=K45ls-$Tfl;Y)v>q%Wtb7S1 zScke(8*l1hx()^vTTviOD^1WCtk|Ho*hdO3;KD$S$grHDS5K%c?ZRsiap5 z^!cP_>od(Nea~#}N9@{+um9|1@0;!6|F4SF10;CBUO1`>XBTyeG?mU&Fa1sYUz+Wa z+%b!15AIKy^?Y(>9(E`*jTa_t(6T_hfK6^x0C#AB_$5Ni=eY2iuG$ieaNg|w`-Wh-6HHf?Q_P=6wp1{K~ zc#j+pw>LjLiuAjlj-;QSEG)_36_cOHTBX%@Dx;jpXuhYg0`&b_gD z7cu0&1pxrOoP&r$S`;gya!3@LQs84HWi!f66SBq%1Ff-vNu7QX)V>8lh@)^0A_}%( ztFgBfQuU8whG+pmbfp&WprE`G7;0Jv=f*|CROJko(3^pT`f 
zm1&j`D`#yvd8L?#?9v(FkfVIGpqCpLCOcM%m5GWvk`&Wd(&-?Z50O`$tfSP{Anc9l z%omnlwk2B$;NO^y?yla{PMayr>42TLJLO{#;ISkj3x?@ElDLCWEk2ovWzf`8cWWUk zfNV!{VN9VFo|p_kEal`70Iemm`E;V;&&?_{ixx;!3Oz%%5dFzSbr@5^${fJ0#n7w* zvJZ4e)DpD$P_;8#I~Ag!1{*bZgAk!LO~@LRpqqdpq{~OKt)cxmH+d|bc)3Va<{MJ= z#aI81+3LekIX?bFvwIkS%xrGFr~1z)JD@zdPc=J_K-oMBG9E>C207JNBn!FKKVVi{ z_`_u0l`m^nlYjDLeM&hfo)`y}C+Oql6#Lneyk)jWAF7fKA0G$rN1hT*9}#H+OCXAJ zO9I5kt{mdR;30-mYicGJmz*Z7Hq^49l)Z{nJR?e@1`}GSlZ0-`qQrsN*j3zNJ~UnnK{Fk!jGI5?MusYF7u-A*Vr057v^J1G?G)pF+7n+KGrHd0|hJNsdSW z92wTSuwnJsl&9ncI|%7E@g7e$_g+ZQo*(*tI+6Gr61iV-pp{G@x_txzg5auj22|3H zu!Fz_q!6;(i~!(A03)nWHPlYWg0yWj9u=N?O6`~ZC(U;DIQga7z0({;4u(7OUUUD5 zc4Mai9K4+}wC$k9#5Cr!|)t4%hcD@hFMYSbPAtVzFc)EyJB@qQ6 z6wg{5x!@vNE`dyG_C^yBk*erSCit=FAW@b9gOVsO2_$Qw9>S5%fJ>4*QW?JV5XhXN zY@+H1kTD1-!n_LBMOrqc(@g*seFs#B%&M3mSF!^mbrqqfl|7|s#>=CuHrR;U`Q$GXXl%)|Cc~SU)9XHbh&&Uk0P7Qt2V6ST0+{)o;}?Ogj@%ORx)xP_z$ zS&t-8(tJwDZ1q$~xmCd@4n1jQZkY{_5x+DWAL0%RyC0YZ^gn3UBf5^^+&K5j4O8Ib z@tiZF*P0tpQ5OvAt&c-)A+dD-^Ay(mbJ-!@-%R7Syg3@9naum8s~>CsE%wa}^ImYV z{NmIC{L|R`hJ7b)-rUQuFB9g4f5FY=H|drzZ^B&@FT=h}_(eCzZ`#dM1NrB%_f6pS zuHlzqUncyjoBM0H|BS!pAm$I$UEz6o|2+0fn0V$O{u<`bIiF`v?HTOLgr70-4ECK2 zb6+yuW`Q`z1^~a}a+G&r9PQ z_ROi>#-e-^Jh(4_y>DWE+in|vA@;V3{&l$ZG+v3lYZwn-0(;Yhe*ZM~OPJun-8=~2 z#(UuW9r@GP`-bOH+XLJ!%XBK%8{7_i8 zHx0i`_X%Nc$&1DNWTlop_e(?A?Sptm*U@@r$;7(JNi)8JA|S+$jgr1MEaRqO#yR+o z?(0rM8{P}NLMe@eM0!0 z)eXZQ+RcM_M)w;1%SV^#_f9yE!DsNvy7vvA=PYHP%g3*TFUhp-obZ#nw+-`l-hFg= ztM3_pN_QOGHax)FhF_)I-|IUkF=lTY<~iIwi05?g3uE@KVa(n&{G{$};nBTqc)Y$$ z_pUI0?;6JM%XDuF>-VN%{k}q%x9oK%aq}I1=E*%_cw$8%zdm8^AK0A} zMveRWeBJxPp6gA+!0sA;QunrS&h?(*oa?<4eoEIT{GyY1$q7GU*mrjx$DMaLzx4MG z;yK;>hGX=e;k*j>PWVaP+lGB9Z~m3=rM&q|_pI(+!|3{iZ@d!teT6RbZwd>$X;|1R zbbZ5a3j5~XH2gB%Cxpl9?N?%s;P#VyPWQerZ|_~hykvI`Kck!9ACzTfk1r-p+hoJL zhKF?9a7S|zlCp=+a2#mF@V@Zg(;eZRJ)B=2#6oWAY99B?%Fcb|wzxNMkvq-oJHoHh z&1v5_7Jg#l;wpF80v44*^c7U#o( zY2Q2b_JG#b<_hUga(R}HX+f8W0N+FIEL?HpRVtyff%~0+i zg^_%TE*|fFJc)ZyPU80G87!x8`yigcj$tkzA+-6-of>(^X7pRXi$&)ZhBbHB#1l6c zw~sfKV$#7fuZ|)frPhg)i@q}S4+&$rEu*%*%wP&!` z1N-$W@eKA-yI6C-AHEXLU~>dFPxu;rjo|6{8v2~-NL3$)6c3PA+j&_^3>(NDtiJ+| zQmwR~n1~{>s%ONkxt7_+Z1?ECwISHk!A1%~na%zJqki{DqAuzv5Fc;e>t`vSvSy?MgV zV6}SlgrC9sOLp^>cn0gAv74{NGuZqR-!uFSHh zY#2ko^A3+PlkQ;k2mXfPXRu@Qwuxu3*yP4O+c0PK3d3l=!mtnb1%`93pMS!m_>vRO zQ@eM<&tUV=?!6MvV8`k0SK=A$sNO!|XRtllTkr5Q*q-dIU%F?oco1JR|MX|=87!Jg z)(memgJz@d#nQ@cEGkQLM<9yIT_c=x8;i(Y6VG7Pe1YM;_t~+NQ}J#;IN>J@^MLrE zNyMrA?1Z1e#->Iqhqwc5nJDPsjHSqUK0m=uxqW$)@d@k55u|+^D_*Hzn|KE6ZRCc2 zY4{necbt3wrQv6=WAgq9KY{f@VWb1HXOss+-AhqQx3H_{E2EH;!bcI!z75Y{IfdIM zp1^7aUpEXtgUx-y*t^W0Q69`JqO#t-xo1o~fqlxv6E`;&c;30~S-o@j+%S!OC7!`9 z4fUL?Q63}XRxGm}aR+;NxKan9-#2{j^NxupZY~Ept{K25h2J);-Y60caQKL!_Q#xdHcgT-TdJ5_UE<%Zr_WA1K~4cj2ve+DJRv7T^cL3 z>~<%%V4dg=Rx9n@Dd)vel*k?S5y=hp# zFEIR!2d%!~guSiXPviax^QVE&tzWu<-F?jbp}C8F#sjLi4Rc>NP2~O&cff3Ly;vs6 zRBC~=)`Ebt1RRojx4@x%zF|)x_VJNB_?h9S?(Y7nc{CgxNqUOaMkBFGj@l-Zkp0y{dlJ!Ixh!yf)U!#&v%*7z@O?!IA9>h9eQ&ilJz z0?1wLGake`Uv;NAsSkHrec(CQyC(E|*Kpo0CwbCD4En%>^KU+m)lz7QKYNFtFuYQ; z-#N4=?(V*c=bi>O?;V<_ejDs_@;Lg$nBpY;ndBkPojq}Lirg~nH2k98*+#g^Xa>hl z8?-F26G0AwL{{X6a045NQtBWFQL<_#PN?23usYS!mSiG2%9%WC_!$r0gA)LiWu1=7dGvHSvVuXFTx0 z@88{h6Ug@dKlOyUx4VYVVZ-2@Ex3F1=tQion}!dKTg=ln$`A)3tUOs}{2Cm8H%)K{ z&OqIn&Z#3^Sy1l22fey5fKToqGR~sWyE~x`Pqq+IYFkqf znf_+%Qa}-8y)?B%-OdOmB28sWWVzV$!`6OV@+>;cK666_oS*7fwEfN4ErBA)ipIn$ z*qOx=MIx3|*f*o0N@ugR-`2r`TUsauhuW5-Xirr*JDMqhI{6GTW@n59OQy-gohAs@ zC|3?9h3%(%VJ9b%7D~YqSFxgC17}|jWYXpiv}7X$;S^QcdG3^~D$bH0HZDGCC8X#u z`^+T|m`zHkp{0L`l%nkP5R*rI_))RJ*!D8mo&$%+B`g}2VVrqUMfgEP3JmK0K*%Lh 
z-AlwF;xq0Ht)`ZY9b-CCMG0D(0DVx%`NPuw>8_o(G0h#ln`s`#+nJuX<=sr@t$weQ z?l0=utW%TVwFH0e14*5OZuZ2MuYs!fOyp}b%?N`_iR`vYfRj<89a!y`U?VoL^acb@xZ zG*p>Qjaj(zNwZkhR)k?( z{2CAgkKhtSm85Qr{5Piew*H!a@#~SN%qx_4hv1999(9Q4_c84+)tu}bnf7o!?DsL9 zH{#7qUw?ex=Jnu&`GfKIq&WwU;WdA!J?FnC%^%MF_vs}k-E;nX(l0*g958=4-si8! zSk>@5na1+1c%j+?Y|C-Jr^c;M2FL*uvo-~K*>-P7geZM#ce)~9+p3K|)_3%OdHJwN2 zL)^dnKEi#&eZ>A-kf^KwLU?*Udfoy%`zDYh8m8UkJ7h9{Sh)BAR>{&8PGl?45P|2C zl>Q}AilRtH;mAACQIDUmozc#42ooIQ zd~*Rr{V*{yCdf{Pa1$rp7A3=UnNa>&Hsa(@04im;jwFqolM3 z3s1F#;K^I3hoC6S+n-u_rGSE)wY|fSN^}u~J&S1f%__@We^|Kq?8KBDW>^P>h6p^@ z-I?xg>k_Nn*Y+;H6bT?fv50oxj79|!jdCb$6$xrdA%J(FLDU%l z(iONFyCo~=G^IERLVV2Vwa~7WEr|jV7|`G^ja9Xk0m5aWBZ+{}D|$+G0FBN_)K#QJ z`J%OsD4)Nkb8MamOAqG%-%0yE_&V?^y~XP>ukU$>-^TR5V72}ZC(Sv(-AVs7?Y|&A zxB1QkeE3uEFSyT?#h@tuLEtYUYyNp4-jbEgtC<0;p@5y%$-`D2h$s+&0S!JB6DEjM ztE3Q=mXsOjRZ>BjNLtMd`kQy0uqqHsFqmj(M5oj11Tq{a=;a|~F0IyQQ!v6I zrGH73qUiJxgEc>7Mk)Ryujuzel0nle0nPY>unH&S@ zqTYcmnZh1KK`oKxQqcL*({HQ1$u5b{GR^p?I?~uu96U_a{N6&HX_y=SFHK|Q?M!=8 z?{(67s_%8uz8G)zdf@rpUJw6_{WYDx^!+Qof1J-BCm$@2mV@GtmOuN~+;SehOmM(b z0!7{mOu9#b*pk6S_5x~&elB&;GC-4$-Q`WTA#*RNG~odF8W4r96=cK!#kz0iY&|%8ogm&;k<4va2Y*fXjB{3B?4*5IJ~JK{ z54X?n>zdAssa@Q@hv_`_w=(T7w!h!J67O-+zP@kvPVdq7j5p=Y=2icazlfTAvv)e@ z``2{;Wcs(Bx62od!{MR!-|mNyUwm4DvgM7G7*hI20?m-#=C23$_im=~`);Q7`);Or z``_%Od7t0SG_S>9)4pgu3I5dj!;(L{*CD0kLpGeq^J{&!;mBIa_N`9$t=?(w<;`9X z{JxuM{l1%N-sLwt>AcJDWqLfn-AUv5-AwEG%}nzrfr{UgXpG^IGn{MgCb6eW6K-8ZAL{GQVG z+bS~5N}$38G(U@Cr&q*g?3U~g!W=HK8eyhC1wQQs)RNVX=xj1Hmb03)A0(V#D4PU{ zEMi?VSmrDdM@W>Z@ll9QasQQR@0Xbx!&`5GeLmiuL9B}e)Z6}R+Ox>P^~^XUj)%pd z$34ZV~(iDHT`sb#GD|%M?J}X_)5W`)8^8ANvM_C zH^vG|(KxCqIKff`&4YDvN1%B4I>SIHZLOLALDSx6?zhL_d3@!Q?!$R6({rlx7WK_~ zkCVpj+nL7lo0-lzznSTr^IMtTbN*|(zasqWmOn`Z9t*2~oqp$EM8)3g7cs}+-ww~g zTj;<3Igg%;=fi&o0Q0{(&Pj2ID%hEYjySpWtZd1u;=(*I@#x47T;4;-)WL~Iv_bSM zTEu4TmXsZ;IZR>|GBEuq@M$lgmaH;6a?WNc1cXMkFW5qVRLhMif2*I{}UpqKd-&0O>?+i$DLFhi=51&6%2 zRa6WMVg7m3_|Ozb)}OC#TF-B0I-cLlw7)9v_IkXdX@2+o%$7ej`BnVFzli*`_-lF| z7YD)0qjPPVJcs0A(G=z1TOYJfS-jkKi9vF_NQTDyfo3{%x3#qNH)EHA072GOU4B%e zEAq@60uvkv>+qQ^J^i+d1SPQp<*5P9PO{R{$DB--6(BH&M|}7WUYuNd6~J?{DIC5U zmhHDyWU{4=FuZ`-GVT&>pMi5_aPXwf;SnExREm>ZMF^TAfEAV5)S#JuTgNoGcQef` zzMW~G$a|f1-1-8|f%7KqTjcp$$Z{TsZ{NVO4}lA6=naHdZaQsR2z)}hWk-gV((d+u zs{Mp`M}05uE&Q(W=e2*^I7j4xetY5`oISb%0H(~jetY5{8~=%RRn+^n_!;%>iTiBz z^&7^Y)85HAZ8O8lkTAB$_4d->Kl;13_C_xZz#zf}7Z#OGE%GVU{+#R+1( zvx09P%1^X??a}^LyrPsJPu$}kWOvAgJYoD(v}@P#SimQr&k%p5wrA%6OdsRKy6Q4kduv7@#-n(pHU@v& zxGuSq*yVL1da3ru;^(lZ-6Ler_s4hqDdV4{{h{%mmPhuljdSeWbnl8Ip!6x)Xx$fQ zG-LT>JY>RpYak{Ioi<@hM1>@aIO@?1Bq@kWif&>o7IU{&#mChD|gI2Y5%v4 z|5W=KON_r`Vo zyT+f>&hz9na|2922RlbvH4eEDO5YhL_3DlkpK2Da>}P3fQzsqrZ;Wg3N8_*5j?Ev8 z&rNV>Ik-fg_ZGi9@u!S`inh;9H~R9~a!yZ)_mvnKZ`S4YF%J`GopJAN$`UB%u`(oM8?+m{Qga6@aer)`w+8-KUjqk*nn5hl0 zhB7;Ecy=r1Q~kj>jSq~gvgjs1*H+|5ah&|N_)E1v7Qb%(n-kC7XHCq6g+!J+&_+XMOWRMrLu#+`X` zZqJB+Z2VKS^CW*Tu3g`N-!T4C?We^1AisZ{dAIi*`&Q?Mc~~6#L@P3>S0ySaYb_@`*&C5PP|lU6*+Np z@Ib#i6t#ae4nM+ser82KC9cyQp821k&8^PodvWgZ1LKSSE#se~y;gV>*Qpr&u5k|I zgG2cgZBP2Pckz=ZIlaWEhSCwX2wgML#S?>O0oln;dsZ3aZ0=$7%H zX+I^-uuSV@3#rDFU1vROJSuye8RRr>J+kpC#M=E+`BzCIA=%(#v3j1O|$ zS;Z)*3XoX%weio!vHj&c8IP!0{l7Jj{QyXb$0pFc+=lA{!{H| zjC)!racU%<-`cfIYfrq5d&ZyB#)RKn;@T8BtjoCem8sajG5(x(Oz-(M%Tpd-TrN2b z(->ZcmQ$DfX#ADh&dq?C{>{`Hel*Tq^rRn+bM#C8Mq-OTn?ineC{Jo%6K{xP>I36i zS&#qL_;cEM6*}uWahzx{eag&4kq>)@*SLrM=f`7rKZiC=g03hfV#$LSBoDR{QKkb2q{a+^72>#x(^AwAI^ zQ8FyXipM=g3tHUepNzj!`{#%C;nCt8^F7U`RJhgL*_B*H@I`PEWPo>B0SeHC(DZ`J zSM(-UQWYM^RN-6SMVciP`~Zrd~ zltWv%qhOj!$&~bAI{YeT&S``T3eC(+hN2(Usin(Y(F9yb_$_>4Q-(6=T~0ttqI;DQ 
zjqnBvZZUI2P-i02yPSZQM0a6=i2@?vvXZKbOPG=(fwjskuMklR%oT$D1jsSWV&)Xj z2A6J@6C8xf!UStmbPS-!FzFfOD9eh_&FBr-q0FaajBp{)fG~O~iq0gbeyv~mKA|RRk8sEa!N>ojH+N|^ zmblwA+Qw(9Y<%2<<0!nRET$A>2;ZpG?V!llS;W!p9tEUrWidUR+E|54Oox#b92!+8 zBApnY$%I&psk&fn(cp;rY}MW@>UIw4wKm)%b z&=#k(Z3japF+uT?foJf(t$ zaNR+T4<^S1HHr745CyC1<{m#X4mI=!up_JHmEemE)m=!f(`sl_5ITHCHxam^!zg4b z@I~)JYMoX?5lTYkE4s;vmndQu=z@14wN9&{h#0CX(TIv^uyCsm8a}bjK#~Bt z8e0Tq)h>Xh_{yjND5KRyzIql4ZY(VDS-Iq8KcTI&48)PEd1S#`1cBO>$gxX}DYNsD zA~eh7e5@C9B@c}jog!Q*08T~&`sow(W1Rs&P6F8PDN>jc7af|WNgywPNJCkn2+t6S z^(*C|4B+xnJt5=_N5drXg>%A?FS({ za8|!Ncrw~vbZC;Q+r5C@aQ60t>iR|AUK+35nHEVO$8voVNI zqJ<|7*ye=M0XbN%5ECXY#C`}0N?r6SrAXG{5GJvY`cj$C$GreZe#Ok}R~nhd*c%vvqGUg@8B^-7+@=AyDGeCnUfmL-1SCacv(A}%c|e3+Z7|9QMc*-U z5C)Jvp;FAH=T}Zb;cWmQ7bc6)G4-QvDe!>@;mly=8`eg~fl!YLErp&;*)q@s&5X=R z6k3Y_YzKiXU|HQyDs+0q3=iC7^S#!GA^;k0M3Z`Dr(>JWL?YTEw#pPE)`w!jlmbCr zOyTmg6ytz}Gmr1JW`IRT(r_bUj0JZ(n-ziwihSVtz1D{!fMDTFgt~U5vf08vdlr87CkZDzDS-CANrz^AiiSy;gJ^^sC}~v+%|Uo&JVPk3ToCif zQFToAqMIlOEr?BFWR%fKVbvdmCd?V8--jXqr_rI=bZTu`V4($eHLqkU7Nc8)a4O*f zAm*}mqLTtxQT0r=n9)Tb6*wE8O$}|GMaV3x2_(rD*&mC_K^qDz)E}?!>f)$0JwId*`@$nf_KR@$v@ms&^L`c6UJJ(?juge2UeI8VS`0C z`Pg`fGS8AhPZ-a5EDTAvqqQNyTG2R(o|4(OVlFe3f<82!kua)8)X`J5kz6fRtuMSF zO)IkOb?%Nj4D`;ZW(=e@oALpu-zrqTaAgNshK(%XG6gA}><;nL0Z6BUL`7RNmQl6o z0`5W*m|)0WHKLv34m+P!wqH0y z{bq&12MvS>BS^)J0Wi2|=0j94QP5V<2(@U5iDgvHvb>UEFVjgUb2185>BPe0q>NZW zQNvkGOIIe~%EZ2wr|!_{*fcYYAgV^-GAik@N*WCTf$1fJl9ns7jHrQ+=kCabzQUUY z*}*9C4HzzMn1yZhgYg(DzG_>89~GnN_NG;HLMjdttOw_Z;@aVg#UwWTX3dyzl`Vsc zQ)l(+&X(3hSgaa5$W8@QT$F+=!zNT=vIz4Hk@Q4E2I+u|nI2t=i|j5;E*SdR*-QjL zwQeSiZY0GWu1LtK2vG_(Nh+Ht;qXKi?5G_W8eC<&*NNpbJad_{c7vWQpKs=fJ6w?w z7|~d%;W((~kr>56EmpCqI4981&=A@EXU3=gpn7v%v*5=h3#Ch;q$iRc#EL@J-s7(s3NR+3qONt?uP07)RN zW}7;QPH1ug@qrwUm84Qu=SmcGcH}aCtH?rE4>Gb2V2)KQ%PLw>;85s-rgBxuEus^M zZPQEwebEY!Mg%LmL$uN5aG4xA0(jXc)B&;%X+XC#7EQ%(7m#w*ouUdqsL-H?UiLsK ztbkIa4-%qRQ!s>32*M6iqO$aDkvZ6<<7H)xO=S>5X5mSiRnK8nQiQgQSriDXf;Iu9 z3nFNOlT8Q$jBGn*pqN0Ps;uaNwBRZ)NU?U|L6zxVDm=iukTSt<(I#1T5R^!ktGdhq z+EfV)EthHV2@`CQ>_N9O@U%)s(hs}MGEA~40E=(YCRwb}eGG(`i1IQuWRhhXd|0R8 zCy5OwhHIBr$w>Oi9%cv0=<*-|92vH(TiEew(^IcIyf8YO`utd?wP;FJW;)+?8<0(1 zW_Jh4Q^zIbO9DK4!d$nI1SD+`zcQ``>Cpra8)EfS=|m*w#|uC)Q{ZD3qzYLgwrII4 z4`JfUWweFpMKDa|g9IL#iY*$<+DJ6W>KTaFr&gkS9Wwl= z%titBaZ*7ciw1MC%;ZQ~@*vQg&+2yv&`TMUi2?>M$JGpgDhA;)pAU7A-Gi(|%Cw3W z8$7GubY&cJ1qTq_?&?%6iYws^HLNs(ufYUPoED$de=u&H-Vqc%Wy!UZObqf|-2s>q zNOcDgSY#@;Xf+gVBozU#?iMo%u?&EKvMUi~zf7m8h^0n?CYF~4ev}f(t!g_VXWa=q zt85jRr=1K}xlX8~*8RAv40OIxa0^~KlS36}mPw3&72WOuS8W8qhzq*20^X^S6D52= zl#qg(>8V6eqGcmNog?0fWmcX_-PuU&BR7RqfJY)_ zT1AQx_Yg@1v!M1!r%2XaJz-3+Oski3gkO=&?PrJINmJlc5PBFD(3L!J$u>_U^+7JO zYRsbU?vqh)ROis5fPTIKlLcavJ|K4)_K^d3p;rb>TC+pUwCuhaLGr1GC%A`&H-mpS6saPV7vwR`38(!tSGg{ z!JJ7QAHPMf%q151V8N-%GLeb6)jU%c$v)pjD-l;gZ1V`KTq`z@M0@ zFA?SK)A(KEb&5}>H#Kn3Svr4mZYSs@+OW39k|aaXkLn@IP$mmw=fet#PZJC62@)#c zYK5>v)v*-9r36(4{VB}$Z51;aCiC&_`a!_Z8lc_n8tg@ZIm4%d3 zVw55O%uCdQJFw@i_l|JE#rS_V56(~SLv$EvD9F9O-nwREdP4nZhv^hr!mWK)KVQK* zUg}VD<}vtz`RA-hz*C4eCy_&7sDpDPt#Vl3!6huneu+|85M{~E%HdjTsk!yYhgEC| zNI~45DcULrOB`HaSJj5|4jf$)LG~HYN|`c>0S+gdG&6Fl*3nr@+kj|$d!{&lz!C=+ ziOdr5Bb#96&qBB~H5g?A8LuEH`&M5>Q)bm0kl zBwE~tkSZ`MlG&s(t)Hkz4=5U}UMv!}`shVyae-w{2P?(&e$gdKz5`k@O)CQz2?=I4 z5oWDSVfBSrW)9iunZ43F?&8yph4kmDm-xM+m*%XbYVS5E&2cVwlla9TV9|>OW!05kT%{HlShRy4fvA@r!X(Kbh>BnEjuDpO z0dyk3mC#I*(WPt?*JNS0T2kjz9FQbos#vu_T4b8~wg93YS1dml4fLcs+fgXurMq8YNqwW-SD(_SQN;7?D$Uw%+PJ2_NUUBfM z1&5W-v7Qk-%-ONG1HLo=ob_)^e!BIXfCrny`Nhe5cb+7^INey{hTGeNP}&;M;sU#> zHb_M)fi8g{`wVIYoddTBoKFc`ONvwgj)W93zz$kYV4N*%^3mqbU0!ifuI0z|BEoo~> 
zoy3&WCLoP6Xf#>{Mv{$Uaj8+R5)Dye8%Y%ONCfc#TrhfkK&Z_YLT3eF0t&1bOGv4r z0W%xW;!>jwqzzJ&;Dv7fK>QG{RCw5_Y9%_M)21j|46zrBe9ZWXnG0H6U@2EPf@@Wt zT7tlHZ6$HUOIe%-nSJVXaFw0aW|vh)+8*ujp`pNzO?-pO#hSU%5swAM}r zpzal)duN(`()6g)ydzL_BAgOaT7C2ax47&}{20za zRk+g(CT|w8VyJl)3GsNcMH;RWgekBpn^J`ee!$Ww!^k8qBJ1569GXW4LgB?5s)UG4v!r(XCGXUVW|vKVz}2} zk-}>8vNie24F_t21wn`@CASnFYY<{ZUnN~cggMa~xK~n>NVF=hs<3_npYk>x81cx_ zqTd>c8F@OPB)NKY?FXNRb1Eo(Fs3HKt2;#+N;o^yeuSe7FQe#Va4Sg_pk|K@lniRA zQ)y?RLA2qV5UGf{hOHW6Tvbi>6Zn*Yg}TJ&%}4zO*0J=I^+6ZUPKGOMY1L#py>Bt1 zGB0Hck|*d+g)5RK#inXwEf@O%d^!h2kO3iM47-e=ldlZQUMx{~s6doucQpnEX}_1I zQkjw&Agc>X%J=X~h$t2qP@O5Je(@AyabEW6X2MkqY%K_js7^lg;&_A{4#Y$Po-Cv| znFHd26)2eH&`$m0NwERWHkwkBwHjbY1XxpcAj4W_ddXSsZxhMy;Ua3`TkGoLWx5J> zjG`=&b9ch@DLvE!T^qKLnzG36y*LGtU}B)61nDnU1^p5nGwhe;wSC^b1jEKn(eG9Mcz+c@=$N6EGV z(k@LqMaZ2cdy3Us&-`u1L44}Be2pGrd=re_N)ShfVKO;gnzq7`io4LDCW2)WB~w}!QHcY) ziV|_P!Dy8lkYrwnBtWd&vI|8Wgo^1NdQ?&>#xd3D6gv@6?u;Q*C@wM+wTA{gH5({h zUZLOxMJ=}yA0Vi*v1Cv2!F&)evYz>z1z+tvni)^&VKH5e)NQQ&gL!XGMs+&HvAJ`l z&KNR%Fs(Bqay7{7Y6B_aXX(B@RU{z6Cy*QFgYPhjj-G-;6gp$b7`V*urQ)ybE)Z7M}XomAXIgFI>{$jonSoE}wW)JepQ za@dHor+8VIIXjF9n1!%SFE6wrq*xjjwtQn+ta%q4TtKfQSzrbH0|UNiKCIC?Js0qdo}2h{=)oMvhIoQ>E19uXJ7ky zwt39CVc(y&sQ}zgo~%_Gq+F|JmZa_U#U0RObn%ceZMS;H(YMvzakVW!^U(?P120!6 z>h;5ngtedm)pTeV%6>qU9eqj>3Z8pHLW~CqU8v5iRtzHhEgb#K zNR+V`2$k=wS1QQPt47&MVsojWNPcPEX)Me{hAULqsix6t+J!n|K`V3dYF4F;(3eVz zW?se%KZLj8|BH+I;S9tX$@PYTcWB zq4np?=Rk*Z%ewE&AO{TvPImeGV78iep^gX@61?cRhN2pIz{QZK<96vOAap0EN46C( z%{#OFPMPdmxU)0RmC27F(vPf1k70oqMg9}>0j^O{-+MXMqd?os^hdaY85i6X9yLin zvQBY3AvF7m`Mt+`*6}=yS6P43e2xx3T8b^QtxRNWtql&PXGGX4!Ko5+YF^)fQdhKN zfX<9IOiUG|#V0~o0oQbPDk)kc+e&!Jjl}^&;{$lAneaGbHJMC|PGv1v-^f1l3=z#Z zCvMFY2`^AwWLrUG^M~rd6#5Y!5yw={%4$qPu2jd=k>xZO5%p#aER-!VHn@@#JebTB z5maePBTo1&{D@Als*G4L$LX@D5j59migpr2k-j&D}PS zF>LcC_($u7{REz_Vp=qM(4781TCWE|i;SsedvnL}32s>TE>XbNgDbLDHB2h@5i~68 z7f%XHsLu3@w0@-mI|vD zD72J39ja+eerW;)M%VAG_RAvv0WQK$hDHRK8dFUD8gtYl7sQD(6LMyiOTyMu%AS zw#A8d4wROXAB73N!Nw!22S46Az zA#JLeN)Wx~k)iA5P+MWNp*V}=ZIM=nEy|SqE;@~tgjqQ_B;d?SgP$m5w_?x65g=d$ zEUyR>SFZKsVs}887g>ggDozXR|FL5x2y!Di68!(KH_Qwmv-;Fg)O2bqj|jlw?m&>t zQqRa+$V{Y8FGJeNL4RuWvGGX8&fp|7(3VI~@k4#*wDPM*-)T7iT&NGjk!;D?KUDv> zqo3~Icjf2e9udd&$esxtvw_S+E}osd4tqg{o~&!fiV1p=iJ%Hh2h#dNs5

Ky4J7 zX&HT7mNORG*pObn*!-SlCZMw!_Esri!3VH-(Mw1%r97CBtB<@KkTSfpW*Pun#W1Q4 zBv+NdaHqriy`uTM;)}kSifjZhO(BxiTSEaws`aQGw^Gz`QEvk$vq|D14-c(kzw-$)d)znM!-8BmiEl6J5e- zs@yW&Q?0!sb^27M9sHJYIjKMe;#g2KfZ-SApt4?23t534v(yt^!X&1Y{4&4^inr(N zOOSh!#-)C=ai#x;(EQI4kZFm`l( z^VV0h!(kpf7HY77b|3^bGf9Hd(kaWJg;tUWyJ|UX+wU6v!;l%;yE&>hi32zU?dE$T zX9Y;|arEyT{`*Fsjt;paF@!Gd4f=zlf8X$4h)%y(X=gbTo$JLimS%@5;epW|gO0wY z!Cy^5T^`gk-Q*u2hs}d|b7kEFCDcLG0x1*~GXYK_oSaV#0tz9nLsV@oN*qmcx?Svt z?C8f>*T0nLR!_Lj(XQsSy*!*NCuw#(c?<$NIyBJ$lwnkj6s!48p)!3YR&8>bq(604 z`srn^5G`hYhBY4B9bahyS5>EmvH>ho$^Sk)484@h3_@C9rd580O<)^GCdWhWF&r@K zarmau>6Hs~^keD90)tL0)M;xPJdmV6kfp9gLbSSRbXk%ug41TUx1yW^A3P)pm>d#3 z>XgGuqf#Z)x+2^vL1hp)NPw?9Gc6|VOjg_2NoV$vj+AVA|Of1q7+j@RK_P~moK zP@7J}5QVl=X%k0*ig*kRXTUR#lv5qANy!06YgtOd+%6JC6GhLAeoeE3GTcn-WF4{j zybOk&?B7OD%iw#`_Uoc)=rJuEzVlG`GZz4Z3yD;Yv=Xb?-wz*Enk{pkUWjwJYHQ3X zb!Bf7+X7w%+K=*^MxSQeBXMrV_Ia5N;uIXh{4DLPJI)MsDn$2$1(V(4wR^#ETYb$sV_pMM*s$hP^`sYm=)5K;8ar&wDoHyv%CR#cMxiQF z^-I4smG+9BY`oNukA-QXiydPeI!y+_ZF3@KGgn3)z-mS=w8Kx!P5+?Q(P?$SL#Tb6 z8tkPoEq1UfC|?mK+PvW~6)~WXlP*;LVR$&F>}-G3ZY>u(2%W`GJxu_R>tq8>b+iM8 zZ~~$~jV@rt`jI6Vs9Y#3TWNB{++|y)-hP<$^b0W zlt|cuu_d|Gb~xlGJ}YwG6D+-S3(KqHi%|WDZKqiynd>tR)M*8JsZJ2V87ibbaGH}k z7{JknHsWxqSP1`@(f>Tg+Wp1w3_+QtUGk2^P{K_{Y)LM)9c~JlXnI#5C7_H){lqY| zio+DzS?!?866x^T|57oIqt!*|EPlq9r0<|MY{IQTa4fsh6Y+cGN9Rc zjauJqKJXh!E_QSfYOk|4SP|4Zl8vwvw}lx!C8k+j=sH+S$%>0#))XgND6sq#hw(T3 zab6qUnNUBA3~q+w(sdnRHSFjvn!zY?oU)WdOz~oFC#rlX4~K1`nplCb6lU{_h@hPM z%{|~`1eBJSYv>fJK^fxZ)NE7O*>JKDP5TjJ`zOszq6;kuXoS!l4YEPl%>v@wx{H?4 z+TelhXo;3^L2}GkhB7>re^AkMZFUTLY)^u#VPJxxtfoQgAZ-d~vvlbHI(ma>zfF-V zb(_+0azsbd-5wJiQauTN+2}LpOjz508eU=t;V?x^Y)tSgNADs#(>DHx;bYI(|Lf7` zGI_y_neHEkr}iHiooD8IuKc5;XYT5+n9=4^dcB$yaIW6>4gY;t-CO7cblKlTb2!2Q zw=g43j!2;vC@2hMHg17?#yGk<=^s5lj=)8LM(fKxkhX!R949U*d_PlA--J_^q6M?w z<_Jc7!w0y!VQvS@8VijMt(7&2@jBDQGvrKo>EQdB!sw4k+mn9V=+~K!**6ToVUyxaDu>L@3LdX{D zrTzdPMj&aVWe=i~E|~cJpws20K+Y5nQZNwixd`hrJ@h^3M@mjwRT}~u%-AL6%N<+N zA=Nm|>U24Y9CqH=-N8gDEL8L$GePq0w4}T;vo-`9Fkchdc3an7TZBW3ShLkJmmdJb z1V1!7tup?$;U64*2kXY$B%*!zb??q2qtC_o<}1(D zdu;T)|1^sYy$U`^K2BUCF5IjipRg*FVVs)Fv!l~c*im$e+cF#lLz8k305FGHn2Av;o$B01Ok@b0)HK z0AIB`RC5*7$vQHXAuUGbqg;-xYRYe6woO9U%0|<;W{o`68rOU$N(+G~!*Qs`67L$5 z&bS_^0bdRNIQ-#%`{?6dcY1NS^o(aVDDk}R@c>k?OT24c=I~zL9ZMbRw{OC2FaU1BYa}k`@ zgNt3mdzfz-y~E1HGONzr(FdhcU4_?0NCg#%Gmq^DfM79KRw_x1hYd_PyqRUlnq4Xf zm1-m>b>ibeS9;91^fCvo0lTETwgWclWNeP6RFRdzEE|t)@w9P_G`77s*N}()Poq<; zp4AzmBmCXykRh?OsW&69G8mfx3O*yVM-m7Ytwy5~J#myl)tmeTbHk zKtIVKYzt{%f|0c7V>_kVfDs)KwhC7u5h~h88T8UotR6HrBfo1is0|cAw8yG%Muzph z9I&54gVm2l)03d&0D&jfz-&-r>LYX$I!iliIt^b2!;cbFmbD2gud=I1pE}_bHo1kb%;t zKq(h_i(3K9cPI+tjkr4H@mwE0z%b=WU^B20Ci_B6(eGsym;q#5^s1m7Oy7#uv5Bpg zkjBE*E+2T+DZ}Cggel*7Phl7m8RbCPn4BC%Jgrr>AeFN!*=jXXm?sU_#|Viey9}h>E}#%7EaO0Y z{8nICjoKuFNhBjEkq2!o^Q{OmmF#r+y0+LE2P`yh8BZaLb^(=HBf6$)E3GT@`cZg7 z0Ef_e5;zp-o-;{Bdl^qfR_rM7&`~#TcAP+_SX~ZbOuNN^NHm#Z4W+IXyF@LN=T$** zGa!>`-MF485$YgC0iD4tR)-XuVs$x$F(FdpCB>~2H8iQk&;c?~vBnj&f}n3q$f*F+ zYE+?+mea@xiIoFVW3{d^VPPK?|yn^x=+wMed41;r5X9j3jE0clWO2hD^nXhk$)WUzzO zi=?e!d{JahftfUF(+Ujxd1uL5!DbK!x*DsJt&+MPnkmo}AH>xm!QusEQ6%jYmLo-G z1=MKN73+HdynJWL3IU8>tU)j#THK1P*sN6Rm9BZYa^F%iY;kYMowqArKfq|$Jt$Q(F17JRZ|&r1_sF2Wie!f3QhJs?OS zduBKyNn~_YVqZQdP#0NQ$f(-H9kmu{Xm$FC+6xQq@~Yq@&XWeFb>y*04cUdO#bcU8 z1enSWvM7?)Ibp>?kk|rZN`5b^KwA6GQj6F~k0MtBKckuo*a>xzq5vwG#p;k?@fsfr z5P7p$nm~%ofs@1I6Q4aVO(>0*1s%agqh0C&rCX$yTWmk$M&+)QvHn%6P61g>{HKhY?6W)p{ipn zeRXf4n#ky?#HLtX4&kKJ>TCz#f#67Q*nWGqj2pnucWm-3$3Ob8RQE>S* zL*nT=ka~>GQJk@;Y(YnTM~dYZPgDa}2&g~-E+3<-0;{5Q 
zJycE#dpKAnPB9}<<`XRtv2RKQ5x_FcwW4+8g3=9jkYerfp$N5QJca6}5vHcOf<*BBQGUi`V!p3o>*|!;vC$;6`#1{G_qxrG*|kRnSpi zWm!^JRma)^SU4RO>}Xh^13XLYp!BTsh6@Y9P~6BAQ}Syzz6V$XSSIpdrc|y}jcQ7; z)e^#HZK(iabXCT4eNGX|yx~sH?uX6S@y5O{n^x=+v%m~Dc`-C}8K!k(*`#!n9R(Ah ze2lEK#HKiPq=cpFh9gBLsA8yoFDoqb(gL%bDxAy)F2l5LEIVDkt}W=OV|b#gGQ$bQ zkVP1`R`yj0)0Y!Ll1N5L6k=~ZE`kgVU680*m28#Nb&$exUgKkAT^$Ysu{nbt(k*RY zyA9tCHlYz8mO&x*&Tt7b;M56;8r1`)g`|TN1r`n_uev%M@Uw0?B@;|(`^p-=4mN?# zN16M2X<8CtjqpvQ0P#-RO9a#VZfqCiPCN1!gdg}ox zdg7DiO5kTyvQ=tNBt-$#6Br|!%0U>iwKA;UQyPw~hUo;f)u>Gxn8YmX2MHdly;y^^ z+oB*yb@t3~M3Ttps>B|Z3|YFYZaBw9XU$IROQVL77ML759@-AuL9_w2Xx&)0TJ0+o zs-uqP6h@2;fR|=kg88nr90~H`&q$Q{LM-540ah5r0p~poyft^@K3Zlu$1x*cMNQTfsD;jp zR_<8z4Q^=4e4?@EL_fs@5k$fU(r$}esf~V_;b>=ljA#g7mQ}l9t2IqcU~k_OCRTu$ zl3zAy?0C#u4@eSZXqanEJE@YblDZyJSnLcoGt?%Q)uYoXYh6~ieIfMK?ua$iv=}j9 za_E!5p@6~6AOsV##jVJS%}N!Syb%wmY+Tn#7cgwo4Z~L0^g_%?e%Yk4OC%#GkryM3 z!Z7V*EIXaWrC@Siq)V~RLTSq}Wa+ZHqe|}{n=zDQ;fG~Vduc(5fR9X#un3}PouZ(r zA%4Qu;-MqDDzXj~Xc_C?aG@d?f*TnQIQKOhUs|jMViI|*DU~Z#qnZ+IwS;pC5D#?Nl@QMH{F zn3qOv$U@IHZ#^JQfdRz^f(cctt$+blC{&k-;sq+yRS!1Q*ZiHyhWQmlyb*?f}QR?))1bfZ5;q06j!+d_r@LUZ3L2vUvs`_D^JKWpLbrEFjB?RS< zA)isnR;hi3LOTj9Y2@~Apq$|Afo%oy^Qv1hXsRdCeikabE<%WO-7?-YQ0@*W82Jd^fj&Uz@~*xyvX!U zISGYfNRStQMxx9oS|DQI87=`V!(5L9BsC>~bQo3u1?B~FNx}3xaK-9!2xICh<1Fm4 zffHw0JSe2zdgwH4J81)&r4ex*QEG&T6l*GB#EbzUK0OssAebaq91p<9uPNF?^LrTu zBKD0Lfjrif*a&I2MM3G-VdLI{aGi{<8oY|twI(F6Fbu3t9;oft0lFq!Dlc`Htn1~) zq2Q&O%jQM(x(=-JDxM%d!6aOA;x0YAv9MijgfJRA=JU=_3kw-(qei>b1ExKzewjgF z0!$}9`PAf9tS+biqdJ<4fJrX~(r#Qc3QP|5pd@@`YlOu}-qBkbsv?qiF+0ub48QXzV#@K`EySI)aUcqIFfW(_vU)da(&CNd^nNiq#>^dkVv_ zl^x@wJte=FQQ#UJ_6#Utjqn{Q%{752TP-1MR&h6hB~_!}fhFani`tlzfXp$OUHR%FGt1g=mf%4B-S(>5nv=!6!W z&EPB}9oViKz>UT(Q3J&}RmeyiHCm<~5ag;WJ(-@2;xLv6v>gD&>J5y`|Mmb*%@A*T zfUfY@0erT2oc0;>Q6uBta(nxl`4kHB{P{lkJelBYgxyHhs2))IHo!v%7DUO6uFANs zQ`S%+s~X11Y$6Damo{zE0+U0>L#JVnp#|h}@;W7e(URa8$$}-#h^`7OZKsaQyr(b> zH2OU`8U3)lS+7X}QIdSeYg|V+^ivog!>yT=B+97qAkz#G`2=!xW3Y zX3zqYmmln)Q^<&Qq;ggTC><0~>4^z>jgwIz{bWG73x%RYTSaiopib zZi}&a!cbX8#Sf@hI#y{9k?3G(94NYMExbI(bO441G*_@=qqI;U($i2>h2 z)fw<8lM$TCxE`)_Fngn&NvWl28#pm^KFZv4(nODoZ~~23(~K+X1#M9R-$GT$%A{PC zamCxZ96|zKXx!~qENoZ2{X_%HNeewL!WvkXku4pR4={+hk)$W1n6=!)u zY3w;^VYgG@Lm?y9AbLWyXcKGl;8C?}v6|2i3B(N|Hj7}b?DZYK#q_ImRBPfvrz>o z;*83Su1f673APSsX;(g)Hc<4K_cfz!;2J1)iYe%*??}~*3B_ulfvzn;)Q56a4suEq ztIHvbi4nCSxRIehFE$zOgF@;}DJRHaDq8?RZi5rUh!;PS@Wqs3XO?ljobZ^p?aD{f zc2-&?hR!FWz%^)~i-4`l79`U)x2q;E$4D@#*2)M;Q{*~MVi0+cig+|_WVM7O@oduA z@tC(B7eNL}uF+W-9g>$YMHdVWOrBYWF~3UTiYl2Zh+15+R>qz!}$yT36m$ zjZmQjt?M|=FjIlW3!dKTCZRA43G(9C6K_A!0ulSpa0zQf!!%=u1xCVPg?a#2vj7#| zfXV=ZEMnbz3S>)=M^;Ph_?nF?4Uq7rM8HR8O65Q?WthoNX;~dEhlz-~vDPgInPPQ0 zdaP;~&^3{cDZYKpCJ%wRNkQcvM z9Sfl{_nfrIkagI0!Z*mY#ux~j8af7WwRk9r0AN^IvoV>~H?D)WmF|dzDfpqe%2i52_(J)sF zU|Zgrykk`X^uKw427BKF%$0rL1Nh+Z{JiHQG(U{DJpj+oTmJ^W4luW`KeqmL?zhyH z;b+A?oD-3#4b?-#W__NF%*5{%1{SLi|Tqv5w>u!D73`ntN;`diV;jgVHgtR#Yc;RFPk*> zoV1{nQ-zc90v(lx=?7F`9mnhxSOB4TK6bp`R3a#hxKJ3*K?ZBXmS0uTtyOPe-nfrr!^F(Dt>I*Jac;g)f= zg)1s__EJc?l&b=Z7d(sUCZRA43G(94NLlW9frx!)sKqdBb~O1C_!*u2lqb|fiUO!$ ziqWHt#YE!+ro%8NUxGa7O0^qTWucIIV@Ak_RwFMG6nQDfaYcphP!wEF*=z+CFL-p~ zJMSqBLxQ~c#{jBdHfiiRX_BFq7S;;q5pQF4@>8A=c6N9oNn~_Y#&dn1Jb>*TLEr8UF#p`{w>*IJzwrV1ZGGK=|J`4qbNS!;4(uQO_x%yS_Z_(Y1%1~a@w*@3 z{M$Q!J>K#T@btg)0nYGW2jC+3-++53f8l>lbM@c&4)E~5^#S~E&-c9pT>q~-(C_zu z@gwf~f8Bxq)g9RD|F(DF-4DQppZ7cdJFqjq>z~p2-M#G{VE%W0gq;744{(0H?|lc} z{Q&;M^}fFW{`$Z9BlOk>IM@EI?*PxvJ0HMj|BVmOkN1raaGsubeuVt}zVQKge!lJi zH_$6_7Jf@y5hCDf7C=7v|60V+4~xxcK|J}@xRI_gPdO$;URUJ+8dWGfh$+2@MvM#~ 
z$f5|SOo%bi#EV}~Xi`Zb1S9vY%SDi(WxllVGb-8X@^x*ocfArRo~}QB00z0{a~Ni! zL+?{IfkWYBNLq#o)#6rU#g3xj@|kIMj-~0;ai6!%%16@%y_OOY@KG49mo}6fa0>si z12CS0zv%&H__w_SJ{50#2b}+nAED-d-vjuk=+UJ=f{(V){M-ZnQ1)Ax)jymJ5Y8W} zQ-NOO-&~3x`Q1)M7S*#LU+(_7R=W6)cVMW8TXgWRLwvY%f4#^;xZ-69nFb8y@iHN^iYRpw#B<1Vy^mgcVh2r9p-jS zS09WIk4Mi1&kmH>MuYIA{PA&;r=$D_?!+HDG}a#MQx4&&a(w6zf9&x2hv?S4`615V zAs;;+#7FJ~z8^b$_MLF_k1am?H}U)-xcYy%PM>=S{QubDbMM4)|8t8^J;Yyj_|!vq zh5mG%KJyTN-QhD2vHxT|_&7a(i0;9IcVZsZKXv%LJF$P4{?y|09;dHEoQJ&Uy!&?6 z3ptmW67RpePM>rq{-MJsJx-k9?GN$zo!|=n?PdCuJ8_<4o$uj0YS z>0dg0!XeIGe)>-QONXyR_&4eLBm2gW6Swlw$B91~|I*iVxO4&g|g)X8S{uS0m#=F+fz z-(`yLeV_Y}{&-#&cAk-P+H~P`X>4!Uf^C5ijf9Q$j-tqT>MVJ15nftT4 z@cv+Y|Goa!L(ttBzP|;EZ#cwHboj$J479xJj73R_@+bnC_no+eI4Rk z?u3uglRxbF^Yp{_`ri++e;I$c#lQb``Z|PvzyIxz_-}A0c-$WSP2m2s9sc<@asKfA ze2e42m|TkUIgYRY+3_hI{_zmD+}@A;us^56KOLg$_U4D^wm*1?d9Z%E!{6TttbK&$ zQGVcY`m7GW_aV;dp1l*^;Lo|&e{&~#Oh2c^-~5Sv{t)LTJbNd&&7aocFYm;;1E1L9 zukOU>cKEA9_!NEKb^41#d~%1sIK;WjPv418?(i3f=#SEacY@!;r(UK%9b$iNZ+!@U z(w}>s{_vZ?|C2lX;Z7X)PjB(@5KnaYc!;?|58jDKI(#_9{$2VyL=XIVD1G>Smd?Uc z%IEPrL7PW^6CUJ&4qTirlNaji5c`*tE78;ACE_y8b$aMtUw2~qK5(<=5FP802X+oI zSMHG(zw8kE$K$czM9!IkWPht%iR(w(Z|wRx>>rP>JHZwB>+AFj z9w+ZdeI8oR6Y%hhC+peYbqDY^ojbM7CMV@7pG(OOpOv}X&0qMAeCm9bEtNtB70OpG z^|mgWW;Vj^5;lZ+MVU2aG|942;mth0)*19d^n2lt3JS_BF`~fdDwto46BuXBdLgJ) zjpO*HU*6ar<~EZ}%hUwMDOfMW&M*8?L7|ztmJ{=t!W?Sl;Kn0~8Di@fKE7jS4}|Ac zKlPF)U-eP&QVH8=5c3!Q39V-eNd$(`cAI?VKYWN|FfUs8p^sCTJI>(7jQ2f<&})}x z4l$=^EGK0=qSguaaR1FiINhUnVlIeRrI?#Bm&|L#wdNb3wCHuU~KLv7XpDXbT=60#>2!-7+-?2BR}Pua2u%g*Ju3vpm3yK_KRLrt3NhC!zAK9#d9w_*orjviKq)a-0x3Q^R;=j894}2Fax+pmg@}y%GD2K90Wj&O#p;VH%?{U%AFLcx@%3?ZwB1 z_BaqRkuPJgixB~;T#T(lFB-&r^g~xfBYL1~5y)^>2-B3n*!*OAg}3qLWInh zvC3sc7&S`7<6Jbz(Nh{enZ7{dMKo~+!-z2Qqvv&av15jy*4J%qL8ckXiPMA2n;fA~d%n7^pT7mGnPELOUV z2-wMWlC`eMG_&p%9=|rTC#W+2&EhmV}EY@Ym3_%UQ&w5?Cx|YGV^DRh=GnDgf6-abYib`R_ z**u!%!7j!nh>J1M`L+rqs_Q)y&FWn@Th=)PVr*>&)evXvudT*t_=&QrcUDRABT-biB5xXVbu8MZzj#YMyT19$ zenn89TsBmKu1iLvW^N58+jr3(ckDig6)rdx~B+ z(6EaUVdg`V?7)XeagqFR&?GAp@@Qy@`Os(92^N2GuE+G6p0cqZV8uqzXkxI75do^) zQ&2|kC;};lq;#yHm`6h!kl(CLU{T-Ox#;R;PuaK-usRrv5zAl~!(6JR6DdnVagqFR z&?GAp@@Qy@`AoAH!QzgZGb_WjD}trg0LRH?wG4JKE|#VXX1p!ia!{C5$iUA_00$V7Ol4_)={d zY&!$vO_k6WyX--z-6W;3)qv+P+5l@YBIrH7`NJ==Mr9pNW=;jInFw{x)uq?q;4+) zgy58HG5|6G3;Mz&SX@x$*rp!_Nz}6wBs=kTN@_n)*iZ@LY&U&C<=%`-uy`_>ZI9_{ zL0`SJg0sy|9ZG=uCePNERxS&eXxHrR`G_HOp1QxZTxwcy5IP2MnWVZ-pIO|g7I*R^8+sWNr zvWD5xl1QDB09t~(P7s$dDp0*z+00xvE(A7Z4kfz?yf7kQ0pp&LIQsbeygYHk1AIA-o(p*W=R<@p^rpImC?YO?l=JoWbk=+DgRh_X&>^#@9W5 zr*npf?*wOf_7I%m z*+X~&5At`5Gc*X4KSo#1KLM3pjRVO0O%%;@`wA;LEO!enQCaz13rzzqh{5j1HLk!i zhac<68j1}MU>I5u3mAxPaVj@xHa|!mUdVx&4PDrPwO?51V*lkUy`w=l53-zy%Sc~z`klTrF06r*bdXFUIPqPzgo>`%Hunj z6rxP%It{{f#(@Hhd&UiOV5rvEt--300kAKsIPr3yWQ0>K!vTQBcy++ascH8@QXoDJ zGKq;J9Z|7o+%TlI(XAS+3fpS6wLmj{q%|3c{_i8;Jna`5{+1%$IFZy`VM=W= z$+%%4*Jc{w2djcHAvU6{zT78~(F`Xn(^U*d(8x{3*F#4mM2uwUSm8z<#fl&}=&;L%D*wb)S)rNgO}Mp)ZV%4R}ib*@Ji?y?Ard_dJNhpR3GGeCvap%YWv7`-5-~ zev5-}jNjlO-H+ekAl;9*KZyTY{05Jff2)3r-^lNANBo`t4!@D#B{f$Rs(%>emQ^vC(aq84kMw1foD)>Kg`b=Uv>>XnK27 zv4LjAcdL9rb71Tl2Oqj!Q#g-^k-qxf%^+ii8zjoSS?1$5F3z@8`6X#=k4T_>Nr+$rq9lpFrL6DpZZC@$I;?ZS0+fwD1 zWZN)!*2IRyV^e)vxiu1=NDTs5SvX}_K@Y7|hsbtwO zQjSgaY2_wEr>VVeCq_7(%=8tbN|S z1qc_NlI3wJv-2+QX&UA7mfSSiR40bR_H~fG+}Bm}%AAWcm#HiCbw_$&e1dv(xFkMy zJw$uie&a{WM})(7*0~m310NQyhTpS=d3oYf8(nPj!x~B!XH_^ZWvmuhOw!n%T5Kvd zpgQph6hZN24b+$WBqL2bMr;5q#_9wy?1TP=a^pnk(Mo1DWHHH>SxRf7o3ez}6~%Hk*OE)_sa+X}>9B5JYQ8RA| z>yAd`_iSNoCk8fPTXqEq7oC#jVVBu=7xy%p-rkb!Gd9(+q(2+xyc0RgEH<=MRfQn* 
zDCHo^P<^ROBO9!Mt8jJ1lH;CAX9XVNg>yDdsS?n_=HjGBlEWluf#WAEQ6g@@0V+(v zaGY)MkPfSoI4^Z}X#WI|e-2ld0M%-dDc-{cRYucUDjka@`SIp3A;yoNO1B+3Rq_nWP3@o(6p*iKqz6c#j zNmwyzk)uK@<~ec|P^*t6$GY&PL)0FVWRRJNqBe{ZKha7Bs-RXzA4R0y2E|;o=YxQF z_aal(eh^p?qDzIys+KU}A~Xd>n^K{eOC5Pg5}5!Td$^|2L7kY1ed_PIg@iO^v>tMTDV`$-NTj2LT_H>oQPWc`bEFlDj~f0vll~G-^!? zbc74GA}UrBoz2C-Veiw;w%H^wR3@QC)%81EgrW;m>ojHSOI@%k%RK>9E8J+0gu&QN zEi%QNCYQQpPv&!aX~`#mGx+KLDi;+jjH%D9j{r6VSvs7auTRs%EzSPPM(@W%~1*j$_tL$dNyTq=Ap z0a$~$p>IGvS~2A;Q6xfblIi1xf`r&POafKX*>f@nHH5pBwc2p{VhSI@B-737h%G@P z&*7YPGuH%A*$yByg>lHYaJ`SC(W(&BP+FUlgq3b$PL$@dwHJU|M$FtvhfR2dDCSy1 z1c^+*3b$EK#-J;)`UalE(;|s_A4h{Ar7qXhWIm?jQs`=iz0^m3k*;Uh=h<@Wok-G6 z_?yv~^*OvCZpttEVl-OWvNSD3&d<7m?nnz5Kq`B(z#r=&P8 z0QDE?Na>0?E>FodA_~wjB!w^zpDh&_X0K9ly$dH*UDg-|q|bFYinLFM(`fqi-lfy- z-E1mIKBZ6Q&1~3ShC4c`U%K(Sx6Z#9EyFU{P5OAb_ZH`CVB~=J>CB6ad^*@g(=gDJ zqcSI5%X%}K8Mon#OG8`RnHdCzf_J28myDA{ z3nl`T;%zI$J9-Awl3tV=*$4>WJ$NUJ<}?bWGFl{F(zL+M#*_x-Wat~(8|wh2&M9D; zeDa@Z+S2s)&fd`8SO+NeJ_MkcV)CD8Z?E%^QoH||}%xU;|W#(kWAPjvCys;|v5 z-ycw2`C_3SX=j6n$UV4n(x%~>MwlxB$+mYocBDaxuuCU92t7HYNvoWIt!`-Kxe|a{ zhj$@kWamHA z&W-ypDW}&o9p^%6%LI0Eu@ODf5||LUl_rzq1bk03`CJJw2}dUv8_@+V<7}#WO4 z@q$idDZMz^GLE(uMc>J{5m3~&LN5ZSfFz^bF6ks<;7S1M7zHHm=pIjlt*{utEIpuA zCV8OK5d#;eM8qf{aYbi9Tg}|gg4ZOYRM8VIMAK=Alq(7}>KIsEJXf>~x#--=Vn9IQ zJ31XPa3vtE%hAaY7c{H{TMwoqlf0+XuzGsgB}wL*2*Xs9Z=*iUcxlHZSTagZ+Aipd zBouXDocJVvqa%8xF{5>JPeo!WU(;!$4sBfo*)f5g3~@oDRqTc$K&CP31)V;0oGSr@ zCD%mAh<4iAc8Jm235WiG!dEn6EXS1qlkgsNL_1wIG`Vi>sfc__8y_9ox(Kpm0y`Pv zOfxV&AW|W+s)eA}blRvxTbErl8PmxSPc*~O1SS(qVrpt)@g{8ixVH+J8jrTE5(i2%{`UV81;fqA0p=>h)>#&wb+OjjR2`yHv|D7 z-qVP&99IGeGw`4zI%pI$BSN(?0U+Mf^wvs{7biZtciJ$;OMq0Z8-f5C#-tZ?+K@O` z0tid4iI5Qu{h>Q!7RPZTAOIm@#Ol~r6i_F+j&(IHnTg0l3}h!Xc;C^8lZccn0VWZ6 z(1s12c52bsYX;L|0JB`tpkZRIoLmIi@=Vir^3zS}jF)ZaIE~%Mir$O- zOWH^Vh$X%1px6yX0EevQhIX>)L*!fuxS%7FX3FxWrU8=xIY@{Xv{L7cl_0MKT+oJf zK7-op=FZ;Gm4Z88Fp+cB1foxLM9_}Wt7@|#{_Gp);jY{Nhi{y(mhU%fzG$8V5D)A% zee+23%oAXexhC37-g|b*?SJ;hYcjq#2DNlz^0>IAOWIz!Ekp*ORor+cc}aIm+mF?| zO6Co1WX8&nu4(6OASf?CMzcQAC_uw#L7poC7j!{&^{t}9SV3kro>n0)X%hG(f2;&~ z-O`3BcMSX_8Rdo`0K_F71?m{Jkdy0{HY}~M7(la`-n`t@ElmdcxGkAuUAJ^ZW#9*k zt6^rD@|q?h9phjMV~DEjmTr+#GpW*?v}Kx?w2exkv<`d{LttH;E@{Tnpd+m)iE?vK zMPdb=XwdE$M_W;P-O`4o6&W;9OByvluW3!psuV|C3Gzz71zk{G$Z3*RIRRU}rZsiP zIO;&LuH+MD;6W$9de9;5)Yi#V4KHa;D3WcVl{ha>m$V=hKbg&vsDSJ*==32bZVIi$ zc_rY2Mpz2B5~-kB{Q!S}P9yYjQ)ng5EBP06WXN$Sup}xV`wMynpi$RW;=B@YK`X>l z`P2qbQs8=R*AZjXu4U^0Q#cruI8R0k(uV{)Tgc4-XyhYU$q7g&bI##!N2>%qm+86h z+u6#~gHchtR?)D4H${@-oFX=!QiFcV){$i?Z+L{=xjI%A@h8q&$8 zK}dV|K*rvtL(tTXB;o@KJHjX~!kJVH2^2<=AuMngj#QS;U3KrusfU)ByrJ4kZ?=8fVP!NQfVYc#A;x~ zl~obWKSijl#EE-HASx)IajZ!spIIf{JGSTmwrl%(aoMJ zZb1;_Xi)P*?HaI4;SP+YrDe>8f)6PXa^RGpRFWe#H0?AJq!@n+GeA$_DhtiUNl*Iq z!8zKRoE7eoOh;rR$UXxK7=KPTvcSGD@-fVym+6jbFWYG(321Rvp3mVh#}!99I|Gy4 zhZ|-qyP-8LnRM@TQBMYr#c0Oz;>3rP-Vz`XBco2H$Vg)(2Lb*G_vq;%TqSW{ap+|V zQ;X<1&fy4%ZRzu9Aqluh-)5@}V~C?!eqW}mBlc!CTPF58JPn=86kB)aH{oO>adS@< zH&jS+ZGe;y!)i@O2g@AsL}1umdT!Skx+IQ@8q$^=1GYk`7*dDI(pY!K(*Zh&OU?&I zv-hN@$DU5G(-hcOrngh(v|p9{WD@Y6!EH{lZnJISc)dwm8kfp20_#e?jZ#klZJ6M= zagX<-DK^7gmL&FV*gKemJ5mIVKFSRMTQC{T!f+v zQ;W1*bEykfgaqVDpsS8%+O?ruAN<@-0b*A(lo)!rY zDQggYyg1e8O}O1kPfUw}yc4lHNQI1&6^TD?_`&AdbX8ubqm3yM)chc)vQ7b}JSz%+ z+z{fLi&N{^h%wZv!_mf+2x@+!U29>uu*>UehXE5!BQF)IHMQ1l(%Au6)5#5e1EFmm zokM`OLBTc}4OK?o>qR&bWU%s+Nz?;?a7ZB;C9Cs>M5Y`tS1E)PSf0~88_!@95Pbu7 zhFRf7*OhR+kE21%XzElyhrb^Uh3)B{jxmAyiC~+JUQuSitPYqYgOEh$YMQOI5^Bdl zwnP`tu#zpbOYS?ls;BILI`X~rZ%!PImFN~A&*>PW(xE`1ZD?4RMs>=c(&?%$YYYKj zg{#JSJsJ%e?RB`rpt~L|(<92%wJYS7a@josN|fv3tnfPh&1_`xc?w@G~m=>jc%l9Is;n(~mc2K8_> 
zGcj(?>LEi2rKqb?w`U^S<$Z;sJ^qx|2e>XH1Ox^K(XGY%B=|(3s^f1YG z0SUjO5!V$7B8A>Yh?lgtL}NO553cDKknlSiF%40mQCB4eMAjP`og#QGuBy#?O(U)= z32Ds`0urxiXTx1`92q4iL&0CrSVp0gnEE&m^$VI<1a9R>tDFpdLu08c32I5SXNXs{ zv*D&|hIGw10dHxPUQwXd!R&@5(o;G-&~!!SRt`v3et=)nD03yjD4p>ZXIeLSqG^l3 zjhm>_B*%ffrxE5#fKnW`lrgOHpJ+!b8FDmfl@qYl4UN37Bp}zq#HJvojjVKcS}gXIGOZgt(au&f|> zCt#~fS~ZTk5`bC<6Nm2AfF~Lhfm=Bg1Qfof?TCXb329vxCZ@kZmo&NbQs!2UjFQu9 zn*6!!k|Y8#~imIO!?bs|}!rwize_H|_&LK;b(Y%W)+j z(dFpm6CJ5(k%U5{0>=AuefdPFLC?CnBTd^oo{4@5RZEjA2Pp zSFb19qpg%Nh(iKd>Puzb1KlK2FWZ=5&Ow{T1Vutgvq7-Ar;^OTuW79%%*BaM;wQPd zf>8!9I-;GnwqJ?($s{=R2Nb@dX^rK$5?~VEgN|sYt&!O3pS^Jg zX2{z+dqeMcmYd)2jNJ9TJl%LHVd?MYo{BHw4IO8^NWHupLr=76-5}J@-MF*N5cvx7 z1VX%{t}gINq6(V`I`RqKWzfR>kZ^2dwRD+y^0X(Wuz zxflez}v~KRH?3Q`2Y3;__v7}xk@p+;P zs;g<+S*t~Cid*6O1LC`+(}u*kQ1VH4N*n4fP1D9%t3|BLt?*2eldcEa0~Y$!VO^YT z3A)n}H7#kamR>-n5s3GeR*IuO(V?v?0T;AkLt|mwTIlNLp6U&aD9(6E@*>HJ&UC5k zlCDKRQnhXf0=%J7$Ebz0TnV_K3#v=H76tVB{Ec7U8IgoSW-^O(v`^EHp)&U=oM{4R zspW=NI&BO?TNgom(w#D0fSLwUfK;s;f&edR46#!rl0dE`T+juD)}erAB$&mxrV;Uy z#uQ_$=)DqfK^uwKW(q8O_;iDdW|U`|S=AzJl2EUj;F>ng39?K{6r*rAroE)i{-GPc zyR(noc8bPu+NRqFKIUO8N)P^LkA?EAlww&0QPHtz6IrL1W|Z62GO*QoW@U zK!h=ECCKZRwpXTd2xlP#g3UdZ(I%Z~hKyEcyySV2bV*liT}AO@JEdv^zNLvkA9vaq zMqpjHbVOy~rBBw45O_bV?QGmD?ey-_JC`IROpzbRH z7qpSeox-5pXF2JLAb__}%ZV25ma)_!If&~@f>D?c+LoRjLnSO7+}u+cZPG@k4?t*& zG%Uu8(!PS)n(oF1V2%B<7qrft-xT0HHlQD5mB5Aq0 z%QKCXW;Ad{*R%`xhK`E_2hZK$Ij$sJ(2+6058QZ7`t2J(?(tVuK5L&i&zVP^=RJP| zK35sP;%KV$`;JskoUVL-(fM4GE0AXCg33P?|6uY*1HT8Sje@9jjBf=CICuIJ(UWwV z(uRwq+l-i_6UPNjA4O3q_YMH{){=|k8Ve{oA*gQTCW!!9f(RqRPcK6k(C zvV#_|o#5)EVG<*nWFtwZfoWKi0ZP6-=#nEll3GMG>hc^p8rqHpu~BOkz!4$=xDpyh zc7jY!qCHv0Es#+Gsazdl5+j;q<4_?)qKO4_BcbM~SUQMo)Cp*7ZRWIeA1DnCV#HS3 zE9Z{mIBfmPVEb+iKoF-94jw)9C$y6NJ{-Yr)2}tYW zxK@E=K_u4{Jm~sYNB2wVRaRNuMN_3s91CWGTwX?QHB$QnmR;{g#ow(!) zG)D{ODwwrbfjTvj0Mh)R>t7uXWO{hc6I_ATy>C50Rxok#B)Jdtp=2)0(TId%n1NM0 zS{rq#=)|OSW+}PMo>c%xhy>vB1K7X%Om&#>d$2P{pdx`2yMhI{l}^@?9+b=l*Ki}~ zXAtS6Xj82Li2=wxWG0WLP3&3))Ct9t2%EXoIX))HuQQPevS@);#4$cZh`CHiPu&K< zCr#QfVOp*%GqrRd$gC{nVo!i6X-0y-LnBcK??iaoEzpWU zO&q}j=sMXrKpGe+*Ki}D5bf<6q68O(y~b+K*psn1q!qV zg%%i<(D+ucfS&X>5)ip*N*it@6r!}#>Qo@zH5qkS8s;2JwLloBtAHj6sH9~JgK}uY zIx!O&7rKX__tv8ho((n#9W&F!A5WAx;Tmp4RpgYh;>UnmgQZn#$Vvjzx;U;?fP_Ht zkmd)le{~R)-|*ChV1}^nU7r*{UCE46Pm*J@4`pIlTO&f2!KzT(uNxeg(zwbEXh<%fahf` zR}ZQ*qa17On98QI)3O;Cc7oE!Au>?`;})=;AXUdw;fWHR2k#^*gh(_cBbKg>QqWez z%*5D9;6whsU47(P9*k#DDO=plKlDUG@h9hXwXB9K^zgy=!;hXh~%NJCK zI^~=5tBWRIqf|eTw(7?aW7G*CDY%cp;~(|Zb2Gp4*7>X9DgVN!{@<qF-<0Y)-9NRh$guYOh6esYimS8VHsF8 zZO7P&OCG!j%?;q8>UGTSg*j3$8q~t#a3IqiA`=-GT3}SGhsFsu2pzNKP7DAr6nm9) zk)OKd4XaK-j8PjnR-EK$DQ#lcDp02;rXBhQallRxVVy`Or$q}$3R1O?suLqhs_v)| znx>SxEy;6;2-<4fEQnPnpsj%9c$^tC5GzPbv_^?E%aBn`5Mk*2^kRzOkOZO2sv*Gw z#n8ewIIP&)t}_~uP(9UHaU~OD)CoXNJZ+PSv@TV%3aA@pr)3LclL<;66AQA}#4>?0 zmc5fBOkzZntOG6mO~dNQ>_I}!ksS$b5yK+mLcOcyi;hMFTg61!5jA?0Bur8AM%#c{0ynuToJ z5o}>BCWx?3tfEB5#VsHyxH@S-N@7HlY+U+u8rEc>A>ST&b8M?&Al+4X$%E^K&&3hB zD-dS)!mN7<7y+v^k`!g4W*N6YE8-a63Kr0VEfJ|EO)1F*CqJsl536kv1X69_SPOR+ z`Eq*;$0|UJK+`Tgn+!Wa4#mQ!mvIaH$gML-e$qaN-*M|aJn#M&-uf?o>c9Ke{hR;q z58fC5m;5vLyZ_+p?(3g<|Lgm!{^R!RpE_TF|5NK%zUy_FFkFILk6k_4d}A5|wtG|42X zLhfs-wBbfVB|+NQT9T>Q1_`Luc;-w3(z@0_)B>}s?6hoQRN?C$DNt;|%_@8B0aMCJ z)a6mg6#I0VQtGzg<;QUcE(1U^6(kXUX=sZvQq)$x#%?YI6k?ao=AcX(NeVP*jWaet zEA~zr%uJ9TQrlrQY0hAm9wT+j4{MKCn*#O(W+ZWE(L&=Y3Y}FzlMqxnvmm4Pw^nb! 
zFc*YqjSDT%3VhS4iU}6hlVl>g2PGb)4L5?M<*Kb&Fre08X+;F2VUQjZyH*u-I+UH3 zEzCnu`q0wM6`VF4XA-Bdlh`CO3B!<@h*aWu;FiX~P|7J|#g$COftMT&Sp{f83~;Oh z$!cY%Wea1I2}&Om$y~u{!!e4Z2M9e8+bBG|7FS52b2*wMHZq7LIMTEr@Kx zOAcN{XF8$hpsZk+-BngC;1)*KoJmfiJz2&rU^_voO2|djDD}w2feIlKO)Quj2{lK> z(m`aSPC#4d(9(=@tW}kQpIW+Y0J?@u(R7B&lw@Yab_=v3P!nKC>qsAdJXmqy1YV0A zwHlF7hz3@zt&LG9psl*bLgNB#l~sVu5YsMAnhcAt=>9d)Y?g5gjB4wo)66LKI7J+V z)`|35oWV|@BBAD}7)V?Ct!s*N>QihsFtZfa24WED!cHlqUM4fsV>?qb@~U&Pf@Ub4Q*);g;%&)3wfu+)GH? z0B(|;kx3&-fe;a%rUhCNs0lEn2qT(g;{byfskGtb+XHWoZ8gkH#RRBWRO%e%6gbiHk9AEe7#GVLG(>+8_GCh#5V3p`P*+@Y8*V@vC8wrI7L0fGL zBHJ+h(vXEfWzU#N)6k&AsEl+uU}EhclRGinLH7{!);H@GY-@C;qV&LtvKG098wrI4 zL0jn{BHOS5qakZ7hzrNmt3a}V5@0wUbp5M?p!}Lxhn@vdnAx`;Fe_Lk#!faaJvvRU z;pEeGB1w3@ubvZcFk;epn4NQ*jcpjg^vvmhNmT zlFehGZY5Fxw=l93ls@72;It{+c1fRAY!aCSf>f)h>R>2#LR!NfRpbq;ZUiW<1dbID zN=s>x+cc|Z9O4~7Z$E(ji_M_)3BLz7tBfUq6U!M7mkE>H2Pn$e^^YZ8^+wQE!^||p z1_>Z3fqFfrb5fwK6lEkQPFkUz-71GFFnwS-WErlzkZ4qdU+Q6|E?krl$Jp<+3cKj04E!K~{{2*` zC&|VElE61k1J>4vghDj1YHe-QrD9pl8cS&tyH)`lAre5EAHe?AVJh7z`1CSvfsCz> zxC}srI7)OL5}^vAk#f{+>8~N8fmIVujAGRZn0V$)rhO5blGRa2u#iexLc<&^o&7b@ zp7tqs5iqlFJz(gDCvEhohyyzbNFq&^767RvRA~--kYofl03d;SJtD3GQ?CN*SOjac zwt)+J9pyC4MSYf`i$%~ui4y=r+Ck_%p!C3rvKG098wr&JX~(ha7#lXgL<610V|Qi( zJJ?>cG5C_t42w*G!+}f>&v}BIRmO7kfLX!9dQ?zF`@^@+op3W*n2p-sTD<{oA_YRU zP99pI72cz6a8e0LfGkc*)VEuWqyg$10F4l`y)pg|W#5 zr4J%AT}+t~v*V$1WrhwGC}xt414L+~T*HloN`iD!w6(Ee#>q(H9_&_HIutsq0GWXj zfXfeH|LQQ+^zfV~xMHn)-+F+oVEgDva!mH=G^GtEKe-N)b_p*aQz|aZA6gl)CY}ie z?4Q=!y}?o(6WTM`M#j-S)T!u2A?66!GhMG5GNwTelm0Pf?4)7sEG?6ZlMi%K8UeDQ z%wYwD(o)*Qu2mpetwg~OpVTQPD1E~3!D&OEnS_#rT*)*NkzHt#13f5J>uHThs3a_F zkJrYAmpp;aViJuBKUM)U10{eoKY;zKuV}+MF%uaVTA&pPoY)mCfUc8`Ob<%(;Tmql zTtx`l%4C^RabN>RLsk-y*2QtH0?C4yTq8TX*`|@G>r76BG&tnm19^f+smCd(q9x(y zZ@q?civC{eF!$!2)r^^PhF!FPWNv3x4Q3_?ovGe;RR7(2e@pds z)v!mUXkgWiXpGu%mWF}rU{Y!E*ea`ljU^H=J)3mMp$+Rqdroj+HtBuq0kVP}K#vL| zQcarDWHENZ$;Ih0<01=^j5_+$au4>^jkW}udKE|(1QhxF_1EskXlR>Eq;+vzs{jds5`fDOXkD@sL|7+Qf$X3KBn4L|4J*-vr0R~gfTVFu1{xGm zIf!L}svGI9s1LaZ`^HjQJaM`TXp*pPN3eylz~>MSQwyJ76oysy*h6DwAdV8FheQAa zBu$Q54SQ6I23FmO#;EDBtdtz!b8*D!3S^5wfQVfXQ2Sdem%bEe&>9q4U{pdAyMhJu zB-zOH=rpAbC%+ppNZM)b@oH1du3}jWcNV)<3xt?@70@IEMZOC%YJY3x(w71aTH`_s zj7kW1t}0fLo+KNY9-T&BGGghv<%bm@nP%Yf+=(2xkSNV4#~OQPin7zP8Hc8>4oV*r z$y~u{!!e3D#z%L=T=GfQ4F?4tJ1a*mT}L#qYHe+dIst7pE19&di>S2<&?3;ZOV1|5 zPEbiI!Ne-;g|5LN2?A=u3NaV1s^sRt3A`4$r4b2*g=1Usi^w*-LVz>;i)IM2@kUCuHb+i?M*UOY*q#fU^G>C zR0vIzqn-5I~wAiK7}WAy6I_%L6qZL!2;;Y$*&G8OFVE5cehZ=DPzS? 
z0BQ}ER;?i`2}tYWxK;rY0>wj`AHe?AK@emoatP9H0ZC-&ny^C5g^Lu*IdB56MQ&+C zLMf+=6+ea;8(#9@gaIL~%j{VNk_D6i!wn4+XJ{nqI&%dF9MW!R?I)Kp1u9{ZjRQpJ zm~ss_5(*LdZ6#Yowqf|Cq3u`@%dJ%)Sr7>@91rUL)nO`Z5^L{S5DC(DRLa^{SjJR| zv6B_22W9L!$1=H!5VW=S&NL&i0i$7*WFoDL<5~rh1(94+@Sy8o9o@tJooTHQa|G;L z4`Mmvh!R~V>qrkuhp<;`k|mXdWi`x9Gm^OE#HTNieG|J@MQb?+g`3mTjB>28 z<0k=SCajJPLcmUt$>>lhJ7@vhMKJ3YY!EtA6Oph66O-9w#%)^rR~yFa8wiIP%Jv^g-)6#X~!g#O<2VP%ufR5zzMt-xup>ag@t2V z$rh1qc*%<~Yb=NXj#VI8t?aaHVQex%4#mP}3wGv3Rm)Xb5hXPdmq8c0<;J;?b5T!uoF}g?_^~f zXP|S1+)QGg?l0VSOv%oH0{tg2m$L* z#P}P24=w^mnX0Xm+$2Vn)OMstr`4w49h0FBQ3u?qGc&b!m+(;T4l$dQo zn;?ggWrXgLqF~ND-9w~*5sVohm0|&OrY1raLL=p<+mfy$KdiWtsW^$)#!AUS3u3vo z3Xl+D+8No|%}!AIm`Elcue7006bYQz6)b>JA94?zC~J{xxRFp;5VY04AhHd^FAZ4; z;Ji2@cLl-ht`aGLTNqg!1m!n8^#nJoOjUW)1BME5`sm@HJ1XR!l~T6_FF%nVR>Q;< zs7}CW(l<%n*%Xq^W1%hsU2ta!II5heMD|VVf)p!#271K)!;BT(fnA^8}Bb z6C6JsN~GOHFoH!J)`?`U;KHnX-+F-1 z(_n)fMuicnCQWIwwBY5(K`c`PBvU~h{b{)e`^JK}Vob>@ph*JIE-c7gEbw)Y6eyI1 zHa7wYz_n_aL|sLzm;QD(~?65nq*1Mv8_&pAc3k6d2lB- zi5sl~Q?CNa0-AP2U=Cx^G!k_xdQljr$-M{i6-*=vliY_)O4CS}MnsFlY}Ou+q=?KQ z03d-LJTju(S_RYz#iOwTnoNPOd!#6s^8^ceY}M)kvw|H!5AlgeHEBxCkT19!&5^x+ zd%W5du=CS$5B6OXBE-~dRdED@DrpvE)c)3*p!7jxri(%nak}q4Kvpnu@g%tqaT=*~ zY0#=~Ic~)6R2(C+9C>iPE!7jJt3xLXQ*uf3gSvld0NuagIZtr2%2eq{KeS(2%MfCMfLZR=n+#73=EK$B1@J1tuno8apnDU!K@n>FUK2h0j~ z0EZz}H(XVKoRv}o1qn4r#nM4!qdw$S6;4=^xKV&(6-ZVqktPxuwZF9{h%h4j^rBF3 z(7gw8&Tvsy!X%T_Jt&jS+8Pn+hz4dHv%nRz^K)1!HAY%;j%$tG+*aZ?qCvE%CMbOn z(I(5#0>QQ(PSBZAqBAuSYH47a9Cce99}3IBsvAkyB*HI^joU(FxwQ(A8DiR{Nt0pm znQD41e0mwTfEg@DC!IQ?L}zLu)FL->{L!F@ia-Tz&CJ9oNaB(Q*K0Q#8?{z}WI;^3 zG-;BQx!{Q?-zb4WG1f8@1ihx!U!B&v8Q^bK?kNK9#O}=%@4=VueW3;Jfh5`hq zZ;proj#WV2C_61%81zFM)`_%O!G&38FI>m?=!uvsx+vzrsne7$4VlTs$;TwsjU*%R zk|$8F78)D1Rsk|YB-a$$0QRpAhcF$MB8$Qqv3GpLv2qb*qArg@rch0qQj*((mmdci zSal-_Od|Z!&=zAYxj5oXmiEkq(12bLbVn;`Bq@@)f}3q}?*X%diHs-x`#>Mc*jYJp zp1S1?t9HCLMt#Vu!r7V3Ul+%<3e>5IX-7nZXkjOauuiN3*+C0P3a(BXCNZL<>W-EM zrg2OL8Wd4Eh-HDQ8%ZE84Uv1WZ!D$76Q`?yCJEbigbiF!4xK@W=hMs3Jp{zog8^2s zLFhV}h@ONJCtSmgAc+V;TLCbj)?jJX8nO^@^x~NCWW*6D9@6{(_OA|t@*AFdg40HU zOZh@U)RoLea}??T_$tY_4BSYlBrGeFWlD?BepmrArGdCSd&+^%$ApEEJ!|ZkgrE?+1cZRa zVJhP$u?mC-Eift^%SUOjK{AlKmBA-XDRo+&p!T;`E`2G`pfxD8z^H^Kb_EOQNwSgY(P>H>PJTCHkhIg<u*DxgUSihLJj)c)4Wr7r~^S%MU9+ zGR?r{xf3~XAyJx9jy3ko6lJGnGcN1|rH_eZuHdxc7{%%lW5%MLE{~a%vwb>^V>0BY zjwtP{S$bHzCS&9b%_v3Hj~6>S8dam;fota6ozS1&^Sd_h@;fQ zArYw*#?IOr_NXFnm>V#QOa<%-94k(!mg3jcYwVaLpb)zRgn*r(^f8gl6`VF4qllwd zyCLS1Ppa-Zd+Icf$pAGljpnErNSkU6#4ovr%+gXjV{#aHgR-l{ZNwr|;9$7?BI1z6 zlrYLvtsYJ2h!SHRZDWe3PLrdaF;chuumU7gfojJYD^oyeK@4!L0ydU_LhOQ!+TU6e zls+bsxq_Qj_Shp#GL4KU$;PEer`4w4-F)>er)P!&fJU8$GSzJ~G)92Mi(|r*p&2I1 zPRkZWoS_YKN#+WMrb+Kx4-k)ZvX8?w)B$zE96M`kL_#slz^cirk1QShIjp2~rhO5X zWLB`4gc6VlaLE)CL|7+QQJisc3rGsCP8yJs7||pfmp+|_H5q8gw+G%F+uGx)Rz&=g zd$6x=`W8#}tOCgbvTaAOg|TQFiMk$R1qU2*?*X%dZCweIY+U+KCY!Z2BGeA8pshU~ zNf9Agau4(!{D)ht3$SL<-;*#wHV#KH>M^v?-kAPM=OIc2Q)F~r!g0iz)+2}tWQdscyD0VTk2L&MmlX(Z}8lhednhZHm}z62}8 zFvoT<6@zgnnpmO{35952)!N$FiA$b9b4;2U@t}ow$zB*iy0S$fi=*iNMZ_VCDPfeU zDsOt!W`#IPj2;pJ43IQAYBlUpDH>RHBO0T2oTW+M9BFLSS_SIV#Iz%#K^(C79D*nc zKD`VrFsju9`3g1&ovDc+A0TOR)N0tHQZ%sYMl?q4OiPo#IhGcWt+EQ(SRw(_vq^^> zI)jwXPcI#76Z$>&09nBfz_6ya1CR!UMrsDANT@j~HX?{@CoXvc2?July3C$cAXz{O z`0v*Ju2}M=<(uK#@hj@DbA6`)hxCg^>sKwu*UvXyc78lqMHM{RtgT^>3MZej8U_Mn zE1AOz2&JVvV*(^AOLmn=yGfH#O;GxnNahMo8;(&BoE{)lh`r+BU=4n^YV675r0&EhA%u&b`s!5YG zL!Q7;b5v|Z5ZO+&znp=3TbfahwJN*1mvAPm2=MMgQ325X8=mt7H>*sQj`6Kv0X@X) zhND@hDWwKd5mn@e6_)~yQ6KWE3U^v9hyjjOfP_HP&dAPgc7jSWiDWv~xCLyt_dvda z?PH`~vBE+;KqCp5)d=BI= z;WWEN2uCelx4dE135YRj1IJ1VIa;_;fMXS)MWAVy 
zo=t|GAi_G4%oQAP$h`;53bqeMFWu4R4wB8Prq-$dq+9QM$vyJ*a=YB9W#C3a@q`9e z&DU5TSzVsjw`SH@@=c=x)|rL6m7SK2to#n0=RrK5E!dd>T9Lqsr5j={MM+IW_vkdG zOJiUfB|$nVtYj*-0q13<_LFKf&9{tH5SBC%_=}bh~yfM2e5zjb&pB3CyPSc zID`v6Db#flHA+26HZnaZ@gQxu5%euTtcHmz#$}IXH7f~DnAo*Cv?igl(LPIUVJs#n zeN4rN3Dk4I--G9lSPaI)d^@TPAHG1MQBP^0TMz0 zCM{bS*$E=76Ukh`0f$Wbta6j6%RbYKe1N23vVj@{Lq#s_w9Y)nhL=2e4@v{os0=5I za`tUPr)V-N9Odla@SGV-VHjnqRu6oL5J#zpgKh)hlctoqEqM8f{ICKfQ-Nv&$68Xo zS`Y&qs{jdspzOjD5CV3B(kDFi1g8zhDB>933Kq~qYTT&++fGPpL_%e>Br#6;F<4@- z+^m%3wA9YF(%N=RB%nl^Wyr7-L>M|hy^NbCAqk?KSSrL(qVs?}C;+8)R*syf#+zeX z0Yp&RDu!hRTV`%St%A?oTe>6c;XorqMErgUjg zL=`!zvYFCI0s}8On)KCOQq-7d6{u4a$u*@MO{SP22VvpU%eVzH>ej;vIx|XirY1ry zBN-`2-InBy{IKG}5M$H{7>$jEkk&=iS_Mc5H0{tg2my<)=>84Q**;BEFV*S+vVv9W zaf&zqk(;K}AW#EC%~7!tL6Sj$=VdLNz9duwF2|TvK$8R_#4Z|GEGEe0B$By;(}rUd zdnYk7?H5my6@${>wAu*asHN+cH>?^@eKba$fYH!4ndT!jC95q-AVy`UWeX$Qv|%o` z$ue$w8iHW!BsPf=O|lNO$W6m)1NI=H=E#o3uGD6zKIFm8Y3V*t8XA~W8n4U=Kx< zV?kU(Q?d$>5MtUH+1btFBPhQn+LLA60;AeG>CBk+(c=_x03w%UO9tq4M9@ceB(;bn z1DEF_j6h;eOZCL*8oODo?6hpgg`FUiS@`r~N{^d%G{=!uY!Y=9k^V%anlz4`)lIq) zmZ&wbh$Ik~=T78c-)^8ft)^i+W=7pvI$Ob_GZgU|V+E%TM|)$Yauv&%0u0Nb9gzwU zSdLnaflZkZ7+v{AWE)=cjw(1wh)ZaztO6v2NWk=LGAurV@@t|!S;j3es;!ewW@x1z z4vu92Jy&RzGFrNB`C&E8OvMBwaIB;xQz`B0xK?I~NeC)w*(4DlOHleCvW3gg%(YZH z#;4+`C|YHe-o#3fIlUM;1?W2>wJHkL>LX?{@m@6Z{fbbfj< zWkzfTzUfp&XGVFkKlmYDa5h8O+$MW+j0E>mqio0-A&p zUGjy1 zO+jlO3w0}T1Ed+Xze8sbM{x|7anmI2l0K{0Bu13fM0AgY(tu|CXi3K&WMCDJhRUv} zF)S-32a_04Zmj~Egr*((`k?OLp_80MGFNcga0DgjSj8q$ms66iI6XQIYvC}xN2O?B z!V6H4j3mM@4Q=btQq-DyjlJj)P>5XuLcroPm8c|=xq_Q*k|4_QQ5h_N(Nx`0AvaA9 zzAZM3H-fenO%F^)3S9JR1@SNj?xnu7n&w%C%8I^ zO=3iotb=qCkZ8&RDiUgrih;DXk)uCY=U`ZlhzUPdfn-6%lL(T!3q{jN)YVo+mZ1e& z5y$veumC#a6X85GFip-H4T`96dTeWtM^Z#Z9&!&esM38Pv$7;cCGKug9JRl-CMbOz zVunR0-9z*wKuuT~EP$?)jRPd1R-5RICRtL5IJT8+5seKmc`;^<1+lSf6(AwRv@^1^ zo1Gx0;7&S|6Xpho6!eDU*rldD8=i3bFm-;b0XNa?eVswIoF< z2?uNznTn&vwyf!!V|TVG5t$_>dn8w&9La7KncX9Wb7oN(R+;?`q(?4dOe}yO4^|vF zfvct*jYud&1FP27#!g)F1e$|87srH?ML9b&Qui`sR1P)CrSsE^DYjAIa`Yf}1q(Y2 z@hCGHJ*%umX~Q5bX}M}^7N>{+q32~aE15{^GJ945O%k^42(~a56J#ZkK%%ay!X9Mv=gYEo>5Xbx+-J^yoCD)NLW}*I)H@{OV8s8uCZ-8|%Cj zz9YZ!_IdyPhvm2K`G=B!Ci$1~_wB#?>HFUQ!l!=xwfFx3;NJkQKKfVjE6%+1{cTX> zlz$6;#hF{}e*^OLf7v;=>yw{<2lDju#q`(n=kopE{a^8}U;SJ6o8SHX1Lwc`zw+Jp zMei@6|CisddiV3t=Rf)G{5!B;4`dy7j{0BE^Z2*VRtoPN^&iJ`OWW_`iKqLBJf8jl z{&YhRU&i)t!h8*X9Pbn6OE})`soA;aJ)d`f$M*VP$@e?GzuQyKyU#m6*k1n;{KM1b zSwB9}$EW`X@^!<<)6HAXI@n%=4LLc=a`WN$kW2f8miT79It^Tomy{J4rF5cPM_geB#<+pcx z{vDY2o_`Yt+v`7y?MQua_xl9- z5_po&`xUlp_xehEh3%fbzS3S{yJxSjv{%^fnRoPV=l#9oz+Z9m?!VH!;{Fl;F)6H&3>o5!^b!8_U!YH-|eHj+wXjX-}wd~-Q7OAyZz2L_>+&vy?wTFhgZFd z8{rMSY^%5t-oVSYiW}h#ylkr&A%6(l@8fA_Qa>Y)j>prp`Wbn2Jf5D_&&Z?W@${^I zLQaN39k1 z(_0R;PBHkLXFua}p8ZaDhmUXeJKY^VzS-||PdK;k(`P+?ry6X|v!C%f&wi)7!^b!K zo$d}F-|Tm~C%g^Yt&7Z;G5DNkKjU+r{Z4m>k8k!n-5ox@+3$2uc)bwoUFiDh9ccg^ z<*q-UM{~yKJo}yQ4j}Pz=v)}3N@bS%lr@O<)H~XFL2`|I`&gAI_+3`H( z`Sv)a9(>NTpYb`*ey6*`$2a?(?hYT{>`TYv{>H_7^quP)t+jrqH(s9NcRt@TSL5&W z#{2F5M114-du|@#_Y%J6{&stx;CH^jZ{Gg*`}-B}E%GEjzkkKE_uA{PWPQc6_ndd= zt2^)CSH1YYc=q&kE1T$ zBd(l2eva$&zUedjS98AVo4$Di`6lxD?D?y{=`;IRbH3`EzWY0vr^w2;8s;7UQQck6 z9sbeWkLvFF_zqM^-)YF~Ze>Ep>?y=50Jo~|<8vJg5r$6)O^Evk&|5^PxZ{FS6d5Zhz;ZVfRUDfaO zXa0OXXTIYnUCa6+I^Z4u zS^YWheEw|SXZ7cPybruXv+wo1D?7H=mmC zj5wzLOy0Zvcl`7m-Ry(UKKSel_vz-X$dk@DwAa26{#DGk9c0HpsvjL*>ht-GzvDlv zKj-D?XOO9Ttoo|v+q&mRvwOX%&HE~Ee3LW#;Ij`t`x&3}?4z4~@Yxs6V|a1##J_XB zi2e}uo!)rgHsM8h@p(bLvMp{EUkv)9ZM9v+T@Zf|FMeOMubkoDD_(ba&Av~6t9aAk z<(Km|oNxbo3Ey*npnts}^PXJoR~YZqNAb^SuQIQ&&t$&U`h@tFc~Z~&6?Uh+!mqH; 
zWPV0_)$0m-$b9o8`Sbnw(|&$h-|^0?^PP;h-pi+cd~45t5eIku7xTz}MDwc0EAAuN zZ#6vLx_EM2IUVPq&-ht)cfP|vn)^}RU7tIA@&@Q>zxRCm|s z4qv(4x=){#Z|m`#(=&e7-JS37kLG?zkQxTzNtKo<9vsob$91G{G+)a)!p^E!&hzgOfy{dDB@dH6gWih5-8EuS^} z`JMjEpU-<1^*jEpo`~)D@w79k&&XMS=FjJM`ZIq%e>U%}{?y>%%Q&0synR)&XZ@K! zpWo@v{Q3OZytn##L3qb_Pv%{l`f5L8yBCh@jP2*Wj`~r5=FjJI{yYAyzTct!U7Gr8 zKV!UqJGR$nj`JBi>(Bi8{7!%7&*!JS)6+Ws63*XT&W}%v)1&UppU>~~Xa0QtY~EXa zHQ1lvUUPHj^BH@`e^!6aJD)$B_g2s2tBdz&zk(k4oM+#))Pv7H{P}#w-|?T-pYzV= zr#v2UeR|8GpZbn3;*EZElXpI!@pt@Z_2<0v`6-WE_vy19zf%o1=h+9Jeel`O_?%}S z-Ry(Ue#YlK`{-sLeD*Uw=h;U$H@)4uk~2Q%+3)mc{(K(a;Ij`t`x&3}?4z4~@Y&D! zoM#^$@5la5P@mtK{rpaU=FjJI{yYA&`g7j-{FGNO#QsiDpWm7NO#MoK=FjJI{yYA& z`g7j-{FLX@&)bl{gnVOD=dXcvzPap!&wj@bKDyZlpMCJz&-k2YAKmPO&wj?|Jp1VK zG+F1H%6`V@Jo}yg%%9KW8+`V`XFua}o_%z)4?g=DpY!aa<7xlo#b47;UOere{QPM1 zx^dovpZxvV?mvF}Jb@qm3V8Q<|9)!!$==*E?>_I~PwhY1n|uD*r%(5v`pZk-pMCn? z{*V3z@-1-9e}i-S=gvRXcka`7pr63Ji}P>6KXInVKhfvzKJy0VU*UNd^Yp{~-8?;i zqVL>S-oQM8`L6(2k5xsgzw7*W<$iSTpM3YH_q>Zg@oxd|1y7q#Z~y4|r}}cQ_O7k} z#OpDn`X|ry`iZ{uw0A8(`3~l^#x^_wmHH{fIoCKF|Nvd7qu<;bv=?)9n@Th&-P7 zwjYtl(|i82`tSBT`+1M@)UxmO{*I0Aujc>iyqRXlCI`6ad=BXav%Hz}E^!SASYW}az`|P|tSuWm_{R(=l zJ4bymslLjO@2|?6oXMSi@YxrhSKg=GC4UU{g*<*v$G?o9+{bQtio>!0T|7^o->h@u zTY5wuPk$Xhx&KtZ>9hKF;g>LE?Wf!K3GAA`il5BCDsOToclN<&U${>?Z$q9cUr4U} zRq&AkNCwk@8aG5 z!YXy=`zZe1{C9qy_N&iJvj0N94)QKs?N`{Y z-Rmpu752`v^Lu@jb%pJoy}s1GeENBNUj2o<;^rlJr+LM{;y#l9&VT*Xk8kb%FC&`o z`v30yck}UnaPe+%<#e2bKI5aC{Z4m>|F8WjK(N)uVfVlFT9O695J(`e{q2_dgA-ep zWrs|6pYj~H?eX;d&a>^}a4F&^O*DSo_x#4$_HaG_Q=a3tJ)WN5dA3~~TIcEGjcFbC z+MWHz+4gWf|5KjhwmqJn-+8uOTn0FV0B8HL@JiM@`@OfVe(!CcaJ_iXZ(iFTuIJzK zc;dPpIAuKTUBO!3qu)5&9iR*S;-4#Tx{O~!Z>}#Ac#?S7b z@}Bsf@^z|sYf^RJ^BZT|!}a`6d5+umczS;4*>-Uez{%S_U>%Yo@2&wcScuF#n*VdCg#43>x7!kulUvc zmG4w9E(q=f?uqY)*H`w+_a(p01vj4RYxnK?3VY>ibJ{hqxF5I^w(Z)!*0$>_?3J(0 zY1hD3e2v$A#n*ks*Zm1!roDd&*OR0tdBdSx{ z_jfVhtvgJO?O&cws>bWS;_JTR>;8ntam9ms(LVC-e(!CKt^1zeI9uav?YFId?`?ay zp5J-4#$oEdBe|rb`n|U`w(fg=<7|zywcob-y|?Y*dVc5G8iy119nngj>i6E(*t+le zjk7h*)_&XS_ujUL>-n8$Yh1P;JNv!2t$y!qd${Va_!_VKim&^Mulo}|E<$@J zu=d?({qQ+<QsQzV3T|<7|zywcob-y|?Y*dVc5G8mH68dz(m%NVi09`HizR zzV3T|<7|zywcob-y|?Y*dVc5G8i&*U^5Bd4JD)GwnTreY;pao@m$F;#wsYE89dsJ@0B&5H(RHN z@A-#y;j+X}nrM7(|2A*7P7mMn59>nfJbk<|&AGkM^Lev%dib7ySXTl$gdofIW1-d4 zeiZk_^EUoAZ_~{a*KNTm;u*d>iDlAc9&%>t^zc1DxDms1TZbpE+w$fvA@fZha#ZzO z{%0Oe2Q$*7G_CRd+q~I2J$%nUtc$0>_IV;YmGQk!4{v=5fA#)zUEYxW<-20LN3^oP zXP@w^&a%RY%3Ij(_IUr>+k;*0?wA?{RmIw^*#7@T=pW`AP$& z{W!j(=eVm;oW>Dx+^vs)uE!Je$>~<>6ym64`dG=-T+_dOR_l5+b z{kHb&^mQA&*3OfX#l5+T+uz38JNE0i>$;Au{kHbo+ON~sZSYz3@0dxDk8ee#(D}Kl0;vZ+>nEAQ>4)!eWPcJs& zf5JTRKjlBgANhGgm=fl6wpYF$_R4?dYdPEX752*al+(uB^%WOBkGm7Ti#_G~)4Sui zi;d(zX?Wxh!1>hoN!`V+znDGOH}8()E;f??q~Vbt7bqTl1$eaE_wt_o71!JM{N}ao z;d=h3JjZQ&JUA_%n7@Q}pWztDJ+2q;`ORzF!}a`6d5+umcyQ7>ZLR8FQB3b1*NgZ3 z=C$qNdj6+8$8CE&*;b54tGe&`E3UWi`ORzF!}a`6d5+umcyzM5C0BO@kt_dGp5wMX zo}S-%wokZTyyrJB?#105L9BOo?()oQ+vDl^ooD-m>&1J1^Ww9=yCXV%A?|QJo}S;l zwmn?W|CHyrZI4H%uiK%wCZ<#O^mL2#*5sb@9Je)J-S_;)**@WV@t)tjxCiZnui@@# zu~=7Zjn{p}*L}s;{S&^%>%Q{gG~1uUrVZq}x+=cL>%QXazT)ft318!NU-@u??eB5d zMzpS;im&myulTyJ__}|>*LdAmzS4e-r+SY2;||k3dXBpqJ>x54HoxL4U)@(c?gQ?K zZM(Lwwe8w>tM#?(E3PYFo71kZu+>-bHD31>kNcogk!{yChJE!_e2v$A#n*ks*ZmW| z#_PWF9Zz2$bM>|RcI~^>`r7sWRqa#1^0hhb`U+cp6<_0ZU-8G&$L=(t7~wv}cI~^> z`r7ps*Ojl$Y1dcS>Z|w~ultI}$>6~~X&-razxTE+7Hj)$9k2VI-#FVIuKFv!#_PV~ zaoRrdT+&hf-rKfVtnIgTyzYB`<7|7l>aX}3ultI}3G0--lJ!neOwFr%ym3a?ea~;4 zZ4X!d6<_0ZU-52>LvS!`^?PsI!`FD-JFoh^x9#Dozv64W?kip=t6SnVES`FIu&;jm z?6<9c?`?ayp5J-4#@X6$Tm9bKZaAK_Zi!AE6Vp$c!`>LgycQah-F;pgdLP8n}cMz;FBx9#C;yzZS>{odR5aMfS&HD31>?^D}PP7|tr?`?bd8n1ij 
zRloPPJzVuye2v$A#pAU1C(y?S>G8xH|Jq&P+U+42fj!8)4a&fqAQ(1U5|9}3(9<5w zT?%4+3woy@c6;WwV=xHc1wpuFQfKaV^;z=}M09_^us;E8Si>I`fW-oh??L43L5B7q zbajFZY(Y@(w90aj#r7aIbZ8~eLD1~9sXl`dYl3d=A_&+{%jK3yjx9)?4ecTG5X>@} zi6PxREe!SXlib+h5g|z40GPySUoxo3O^GK!@ZXhS^zQ_qHWYR=V7hmJavG;W1@R{a z2>Zgo{5yuP0M4y1GD>rNy)IH(gs(-yd&j`}-T}zl2SDFl3Gm+n5WN8?+6Um{IzG^P zeprLe>PzLZVKNwxQ&xb))i8mb|9B8S4JfQ(Wa-hCu-=PgE_B&M%zX1OR|H z`oN0@(Y9Zb96*E~K*SuFknqTK0znNP1f(fe1577S_RugTYojSUZp&5BHkJ)UU59cA z9m`R%N^j*(AUKZYvg07fW0E7tW4QsTQmY9wJe8|VCy*pW=GL7TyYST6fC9r|u$e51 zcG?GO>VqWup@ST%d-e?iL01^*IQh;%wo{Ukc4EBrT{^Fl*8)Rb)wmCGZ-zaL0GnfQ zcJQ65fbZlg0}m^bI~K9vwgd9+ed>dqjYJ&Va#)e*?_>44=W!`3q)B{Q)!NYIr2S zZIHeF-QZ9-s+s5TZv|rxgAMYN90Uh;m>CB#490ge2u53fI~ZCAWrWyaa5gr?4D2W~ zv4dcq+YgvA2S^@~0pKWD-`ODZ3^;_2$_V5*I64~zgZC$3kPb5I95~;h7kks`sR)Ls zT0y0&paqB`iTl0=0rl^rz$|oR0587A_3h_CFoNg zu0Z~o3bP4RGTuW-nW|vvqZNRksSsl35-4Q60{We)0PswLu;Uf>znoEq#qeJzEwIN@ z7mzv-paYSN!NG@iPa+++nL@f8@V6u=O65TWL3+W$pOv&td2nAR0ezjsfRn%mX1>tI zK+=Xc(?HT*BvyE?D@DwtYIu{!EGLlxs_`4|=_DjsR@Td}^M6@0!3r5jR zlA6dQN+ljIkx+fGl3VOdMMI+Ll@Ry+ZRb>c7v=6F?=A-@-hN#Sxh0dL6TFF znoy=2&q}JQN^otJ5I_Z$fg7tp#IM%58cKqso1U}6v@-7Mlxgkr`i ziJFiCc0{)%;+=##5lyXa65InxAXfpw_oP^ngZ!l8$_Y+F{y-8nZk4)8XcdI)3ABy8 zUJ{i!iK?9>mP>hU5^}amRN$VdgVBWS^5B?Ef{^PZc=-=6taf>A5`v5Zg9jT~x*5!5 zx`mE6|2YhPP=SCCnOX*MC`my^?~+{MoFt7|DGOuA>U$;Wfi(WM^4cV3=F566l2DW_ zw)jnB+fyU2n}heASw8m zl)dLnw986ulIoHprq@kEf&F2#_nQB z`YRLBz*dok;{|6Zy^Xa&ehRr?me|{h ztYT_-YQld2tvHJr90 z^I7vOHHABojkrg01tasx8?UU0D%fSBCXk85C`3>~Gx?Ahc~GE4-xlX69czHQGDMav3XE=e<)A9*iEwX;3SC!JJqut`ClN^gQdD-BiK=#;sHCk&WbjAI+ltI*5S5sb zCY#w4BMLM#L@icIWQmkYkYdbf6BF!;paU2Aq;5;39#;g0^%16XPeedGrWI7UHgQMB zN`Wnr6rDuY77~dl-Kj`*jno~sTB&oc)%2CDVkKO#y9K7Dz#x{ zEEIf6MgyqvqB1i7amg6hOW90(n~VX!tjr<5P0oXTBm3};O@1Vy`ERMrvM(!3^FJ(^ zUlk4~PI&SQqMcmGp+j4fxN>XUoHqojzEZhRjNt8-OH|5N)f zy=!2c`m5g2EdIUieQ(*vzV%*bV1WI$IP3Q3u%WUxDzz;prMrb~7LX=+UdbMJJgfu7 zMpR}(vPeF~^SsgD)>GrN-xe6JVbPsfW!tqoZ+-2WaaP?KkL=hTE1u$q>dmPz@27B#ab~yGv&S3E znekZhzB=1h8+pjw#4ER(sebQ$?mpqV@A-{WhnliaLDe0R_0GrbJ^J?y+-tj66xFVA z=)+CAr*Mzp;K7^?s`=9>)si2(`nsf&lY${ zIsTOGF-WDw<%3C^$5g!T7}X1+Zcl(?!f_W9AR|bSsK@Iod-6rYIsw^sjZW+SC*aGcCSWMZ0d#-@V(S8lJkBxACLF>@$~i$g67bdv9wjn8fCltapk^ ziG8q}KNlov07=Wrs(wUxS9=t*x~#oBcJMd7qqnx#t;bwAcJQ~o(5Ka!Y${`%lR;`ug{$p22V zKnZkwR`MSSMi_&{jpsCZwx0%@G+$0<)AE5N+WI4uK*m=kLGYOb^Y}~xf{!Hia|^op zzbAhR38a6V2?+iOl92IL2|x6YB>ciYDgnpWcF6IOL>%8p;#By2JDu+&`Hg-a>{lg` zuPgUK2{e2qVILn!bhiHz65ji7gT>i?oeA>)2ohZTk4l2!I|=^-`%MXSd{(l81R_2v z;k}K|^y`Cz!2^?$u!0*J>DyugFq!}~;s-QHP*Se`Bf=~_A`FM~`TJ*NO&=BQ&Hc-W z@+;@beP~VdSu*S;inc4ApXA%t@?L*G{#8UV9bKyJw!LbN4gDL5^6UB98iu@LM)*s( zf(WPnM-hdX6^e4kD_UEk2zve~B8XX_NON95Br%_faKBcx#*c6TQOswJqCbr&X0xI- zem^UScLkv_uD!^WIf%qV!45JAfl zB7B7lh%lcOM4IseMXQKnHY-{SmNi89yRkqKDHag%&=)9zlr^obA>w=%D3Y4*MBvd4 z`!;OyEFcokccS>CI`Bl#V^0nT;GCBvVjmv7u|O1WSOWOs-Y8}~!-mLE>9HntJ)wr*Rd|;?52{p?*X^yGIY8jIxx#1@;?5v&2vmt`sKIJ3eLwHg) zMCKvflLeK|$8a?RclZ&6p&e+GXy9X!FV65lM%p!Gsquk~r-(jom60TMLn6@1U{`91 zK91v_%njcw+h%)5#=@u@Toe@GvobjOK&%Br1#*}NgFT%AHJ!lR!nNpOQPpNq)&)1p zvpQb>*?{5Tv#BIp&de1Fld1I&A%w<-3en#mO&Ic4D%{R_(F{4el?wmg2ywUz4@F7TRFI6`0#BllLzkHp!Xy`xCf!O>a8cu* zPD+_;mGYbyw;yBtM=0G7ec??1pGo7?SGUhkX)!7O8m(3u68{;bxS5NU@{AUfO5|G7 z?0hXL|J$~9ror}$_Lq}V=W3<@k*+`ef5J@p7g_)LSoKMvZ9VbnuObE2V$zVdoHTm> z|0TrX1SV_2eg_Lmwd5Oen*Q*gLAo3%<~^XidimJ$|GM*%D_$GMag-;n;7^14q7oU2LkEiEV24O-ehr?^__FG%BnR=3aZ zb1^B-{hy$epC>-ifoBTK{|fUJgCM#7r1COgGMo@X^y5(BdBzeq3f38n5ED73LJCH) zLXdGkw;5uc$dC$5V6?l?I02r(26U7d$Z_mA7@VWTJhx%+znC~aoYA?( zaL2Jz;Qxs6mxIsWJqpPfQo$p}AC!Twg7+Q30q`VtSaI-IMfgyO1n3Ej?>k5rx$BCo0 
z;qCw&1y5n8z%wF#L>$LEssi_T7&`@?AdXWTRe|qp7&`_21BlVtV0(lOW8;KJpZM!L z{)`Le%|v&R0@--OC#!qw?&0%Vk<3B{9%Y4LQ{WsT7Avqp@kC74L1M__zX4*D9#%X~ zOynRjCh!x*6T}$8=qNz@8^sgEIN@Q%>~3hr0*w*l%O6$Dt1_sVS89+LH*RzknAcAf zPY`2Zql%AzStrWCI=K%`5R#(ph9lgb}ZY@ z2#w{MT(o*~P$I`05A9Aa(X97ZkyBJEapRJ8HI`Idm8W9KZxhC=(a`bpH^JsQLIY{_`i@*7vAWdmr#_%TuU8%z z2x?8TIslpZ8!P;S13quB$q4~0fOyQYf*jmC-&%5Yhxb7_`ju0GEYA&IV`G^j2Pv74 zDSjeC}7iX5Z5ASVX` z)Fm70SctobJSrz-m>k>}V}U$Z1i)3V$sv(Cfg>TTmOGC*5}n(P9MnuhVP?T%;TN{? z2*o8~z{AJiMCaDzLVyS`X2D|N7q<80Ok}QVB{>ah50SGX3k94!`FxDU#;J zDgZtuUnsvOM>hYZ4U-R6(3xe>6#v1%&tGAYOJrKfz=p}~cB6BO3J96Be;K)ios_fj zvvQ)(2g;5vluwbPWsin>yn~Ftl$=L~*KsT%4*D2{B#j5MuaqbHr2LvZym%@(J4Yq` zBAj)J{zML@E9I8*19HuYJ+uc(C4RBT*l-lq5|;n)g6~TC3UXArAg5SNTtQT)Ytuw^ zR<2FL2>~T@$H~F3H0E+kO1ME5k03YX6w1;-Lp7CiaVBR$l71kEs!Qcr9BMWtSn^+( zwx+s;TFBvxywZ;hqFD;nasepO$Y3HJhBK%LGKkb`njC2;1sZAL7FFT+L=IlEeId6F zPC|5WN##j76gvae0n8V(l!->=k3Cb6mYmySkQ{hCrnn~;sY*c^A7oQT+E(<>$fftB z+?|}pa;aP`fJ7a5tk>lBs#)zoi)8*w3oiSCI*=28o_Wqcq0MRh<>XPD9Tnx-5tW4| zG`07see~Q$E|fC~{v)}^RmeF`Q|{zIM}GX`Us@P!enU>)GdY#8C}LJh9lMue8Bcgo zF3~6QK;cp1V)9_UB4-0lNtEZD4L>ONxU2GOaxyXBV|qcZ$wK;ZEK}sD%Y3~k1D+{z zG#LqeVAUCwjgxy4N=^;rxFSb2YV-@+hC+$y*yfYUOrBD~6;F5_*A8jnf&Z2q$>UE} z8%|((2%|y>(jqMWNg@}Jiigk%{B**91y-Ixep{d%IkY_G;WNMoGWAtVGlBv?l`xH;PM8#g>pRgw%w`ThoiJG-;4_MVp2ANj{9y%nw&N{F zyHg#cYDt!bB3nys_;VQA>Wk#|BnoD-`Q~W zCkmjasRC^FcLj2g!v&yyrUOi$Gh)|#@p!<4NFH4Ju%9ac?wJDMpXor_$2$=B%!pd!2JU61L0zAQq0x_T2L!RqkvH(UiQvmco?O?J1^-pxbuYaNg9__@4biKw4=;ItO z0OZ_=W;>WHz>kNDhA-Mo2YJ7S3jp}r4kioWv@n_B4gekR0MMBNnEgxvD4!`H><>Gb zERf9(kBHyq#E5vZ6COWC4Ah;~nJJIXt5Lc!moE|5OLT&-?YG j0uVD(0EheA4kioWbDQY^;%7Pl_e=qDPZS{cc!B=`zSILN literal 0 HcmV?d00001 diff --git a/auto_round/algorithms/rotation/hadamard/utils/math.py b/auto_round/algorithms/rotation/hadamard/utils/math.py new file mode 100644 index 000000000..2bb11b099 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/utils/math.py @@ -0,0 +1,143 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Hadamard matrix construction utilities. + +Provides ``deterministic_hadamard_matrix`` (Sylvester construction) and +``random_hadamard_matrix`` (loaded from a precomputed safetensors file). +""" +# note that hadamard matrix multiplication reuses code from +# https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/hadamard.py + +from __future__ import annotations + +import math +from pathlib import Path + +import torch +from safetensors import safe_open + +__all__ = ["deterministic_hadamard_matrix", "random_hadamard_matrix", "is_pow2"] + +# Precomputed Hadamard matrices for non-power-of-2 sizes. +_HADAMARD_MATRICES_PATH: Path = Path(__file__).parent / "hadamards.safetensors" + + +def deterministic_hadamard_matrix( + size: int, + dtype: torch.dtype = torch.bfloat16, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Construct an ``(size × size)`` Hadamard matrix via Sylvester's construction. + + ``size`` must be a power of 2. 
+ + Adapted from https://github.com/scipy/scipy/blob/v1.15.2/scipy/linalg/_special_matrices.py + + Args: + size: Order of the matrix; must be a power of 2. + dtype: Output dtype. + device: Output device. + + Returns: + Hadamard tensor of shape ``(size, size)``. + """ + if size <= 0: + raise ValueError("Cannot construct Hadamard matrix with size <= 0") + log2 = int(math.log2(size)) + if size != 2**log2: + raise ValueError("Deterministic Hadamard requires size == 2^n") + + H = torch.tensor([[1]], dtype=dtype, device=device) + for _ in range(log2): + H = torch.vstack((torch.hstack((H, H)), torch.hstack((H, -H)))) + return H + + +def random_hadamard_matrix( + size: int, + dtype: torch.dtype = torch.bfloat16, + device: torch.device = torch.device("cpu"), + gen: torch.Generator | None = None, +) -> torch.Tensor: + """Create a randomly signed Hadamard matrix of order *size*. + + Supports non-powers-of-2 by reading a precomputed base matrix from + ``hadamards.safetensors`` and composing it with a random ±1 diagonal. + + Adapted from https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py + + Args: + size: Dimension of the matrix. + dtype: Output dtype. + device: Output device. + gen: Optional seeded ``torch.Generator`` for reproducibility. + + Returns: + Randomly signed Hadamard tensor of shape ``(size, size)``. + """ + Q = torch.randint(0, 2, (size,), generator=gen, dtype=dtype).to(device) + Q = Q * 2 - 1 + return _matmul_hadU(torch.diag(Q)) + + +def is_pow2(n: int) -> bool: + """Return ``True`` iff *n* is a positive power of two.""" + return n > 0 and (n & (n - 1)) == 0 + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _fetch_hadamard_divisor( + n: int, + dtype: torch.dtype, + device: torch.device = torch.device("cpu"), + file_path: Path = _HADAMARD_MATRICES_PATH, +) -> torch.Tensor | None: + """Return the largest precomputed Hadamard divisor ``k`` of *n* such that + ``n / k`` is a power of two, or ``None`` if no such entry exists.""" + open_device = torch.device("cpu") if device.type == "meta" else device + with safe_open(str(file_path), framework="pt", device=str(open_device)) as f: + divisors = sorted((int(key) for key in f.keys()), reverse=True) + for divisor in divisors: + if n % divisor == 0 and is_pow2(n // divisor): + return f.get_tensor(str(divisor)).to(dtype=dtype, device=device) + return None + + +def _matmul_hadU(X: torch.Tensor) -> torch.Tensor: + """Multiply *X* (a diagonal matrix) by the appropriate Hadamard matrix.""" + size = X.size(0) + dtype = X.dtype + device = X.device + + hadK = _fetch_hadamard_divisor(size, dtype, device=device) + if hadK is None: + raise ValueError(f"Cannot construct random Hadamard matrix of size {size}") + K = hadK.size(0) + + inp = X.clone().view(-1, size, 1) + out = inp.clone() + while inp.shape[1] > K: + inp = inp.view(inp.shape[0], inp.shape[1] // 2, 2, inp.shape[2]) + out = out.view(inp.shape) + out[:, :, 0, :] = inp[:, :, 0, :] + inp[:, :, 1, :] + out[:, :, 1, :] = inp[:, :, 0, :] - inp[:, :, 1, :] + out = out.view(inp.shape[0], inp.shape[1], -1) + inp, out = out, inp + assert inp.shape[1] == K + del out + return (hadK.view(1, K, K).to(inp) @ inp).view(X.shape) diff --git a/auto_round/algorithms/rotation/hadamard/utils/matrix.py b/auto_round/algorithms/rotation/hadamard/utils/matrix.py new file mode 100644 index 000000000..e459127d1 --- /dev/null +++ 
b/auto_round/algorithms/rotation/hadamard/utils/matrix.py @@ -0,0 +1,101 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Linear-algebra helpers for applying weight/activation rotation matrices. + +Note: ``apply_transform_weight`` reuses ideas from +https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/matrix.py +""" +from __future__ import annotations + +import torch + +__all__ = ["apply_transform_weight", "multihead_matmul"] + + +def apply_transform_weight( + transform_weight: torch.Tensor, + value: torch.Tensor, + location: str, + module_type: type[torch.nn.Module], +) -> torch.Tensor: + """Apply *transform_weight* to *value* according to *location*. + + The mathematical relationship for a ``torch.nn.Linear`` layer: + + .. code-block:: none + + y = x W.T (standard linear) + yh = (x V) (U.T W Vi.T).T (rotated linear) + + where *V* is the input-side rotation and *U* the output-side rotation. + + Args: + transform_weight: The rotation matrix to apply. + value: The tensor to rotate (weight or activation). + location: ``"input"`` or ``"weight"``. + module_type: ``type(module)`` – determines how the weight transform + is oriented. + + Returns: + Rotated tensor with the same shape as *value*. + """ + if location == "input": + return multihead_matmul(value, transform_weight) + + if module_type is torch.nn.Linear: + return multihead_matmul(value, transform_weight.T) + + raise NotImplementedError( + f"apply_transform_weight: unsupported location={location!r} " f"with module_type={module_type}" + ) + + +def multihead_matmul(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + """Block-diagonal matrix multiplication over the last two dimensions. + + Handles the case where *A* and *B* have different sizes in their inner + dimension by treating the smaller matrix as a repeated block-diagonal. + + For example, if ``A.shape[-1] == 2 * B.shape[-2]``, this is equivalent to:: + + A @ block_diag(B, B) + + Args: + A: Left-hand tensor. + B: Right-hand tensor. + + Returns: + Result of the generalised matrix multiplication. + + Raises: + ValueError: If the inner dimensions are not evenly divisible. 
+ """ + a_inner = A.shape[-1] + b_inner = B.shape[-2] + + if a_inner > b_inner: + if a_inner % b_inner != 0: + raise ValueError(f"multihead_matmul: A.shape[-1]={a_inner} is not divisible " f"by B.shape[-2]={b_inner}") + num_heads = a_inner // b_inner + A = A.unflatten(-1, (num_heads, b_inner)) + return (A @ B).flatten(-2, -1) + + if a_inner < b_inner: + if b_inner % a_inner != 0: + raise ValueError(f"multihead_matmul: B.shape[-2]={b_inner} is not divisible " f"by A.shape[-1]={a_inner}") + num_heads = b_inner // a_inner + B = B.unflatten(-2, (num_heads, a_inner)) + return (A @ B).flatten(-3, -2) + + return A @ B diff --git a/auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py b/auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py new file mode 100644 index 000000000..0b3bf2eae --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py @@ -0,0 +1,2 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 diff --git a/auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py b/auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py new file mode 100644 index 000000000..c26413248 --- /dev/null +++ b/auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py @@ -0,0 +1,192 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Refer code here: +# https://github.com/IST-DASLab/FP-Quant/blob/master/inference_lib/src/fp_quant/module/triton/mxfp4.py + +import torch +import triton # pylint: disable=E0401 +import triton.language as tl # pylint: disable=E0401 + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 32 * 32}), + triton.Config({"BLOCK_SIZE": 64 * 32}), + triton.Config({"BLOCK_SIZE": 128 * 32}), + triton.Config({"BLOCK_SIZE": 256 * 32}), + triton.Config({"BLOCK_SIZE": 512 * 32}), + ], + key=[], +) +@triton.jit +def mxfp4_forward_kernel( + x_ptr, + hadamard_matrix_ptr, + output_ptr, + clip_mask_ptr, + n_elements: tl.constexpr, + hadamard_dim: tl.constexpr, + group_size: tl.constexpr, + gaussian_scale: tl.constexpr, + quest: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + offsets_hadamard = tl.arange(0, hadamard_dim * hadamard_dim) + hadamard_matrix = tl.load(hadamard_matrix_ptr + offsets_hadamard).reshape(hadamard_dim, hadamard_dim) + + # load x + pid = tl.program_id(0) + start_idx = pid * BLOCK_SIZE + offsets = start_idx + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x_flat = tl.load(x_ptr + offsets, mask=mask) + + # hadamard transform + x = tl.reshape(x_flat, (BLOCK_SIZE // hadamard_dim, hadamard_dim)) + x_had = tl.dot(x, hadamard_matrix) + + # group + x_had_grouped = tl.reshape(x_had, (BLOCK_SIZE // group_size, group_size)) + + # scale + # quest=True: per-group Gaussian-based scale = gaussian_scale * std + # quest=False: per-group max-abs-based scale, adjusted to FP4 range + if quest: + mean_squared = tl.sum(x_had_grouped * x_had_grouped, axis=-1, keep_dims=True) / group_size + mean = tl.sum(x_had_grouped, axis=-1, keep_dims=True) / group_size + std = tl.sqrt(mean_squared - mean * mean) + scales = gaussian_scale * std + 1e-8 + shared_exps = tl.exp2(tl.floor(tl.log2(scales))) + x_had_scaled = x_had_grouped / shared_exps + else: + scales = tl.max(tl.abs(x_had_grouped), axis=-1, keep_dims=True) + shared_exps = tl.exp2(tl.floor(tl.log2(scales)) - 2) / (3 / 4) + x_had_scaled = x_had_grouped / shared_exps + + # quantize + # Map abs(x) to FP4 levels {0, 0.5, 1, 1.5, 2, 3, 4, 6} + x_had_scaled_abs = tl.abs(x_had_scaled) + x_had_scaled_sign = tl.where( + x_had_scaled > 0, + 1, + -1, + ) + + x_fp4 = ( + tl.where( + x_had_scaled_abs > 5, + 6, + tl.where( + x_had_scaled_abs > 3.5, + 4, + tl.where( + x_had_scaled_abs > 2.5, + 3, + tl.where( + x_had_scaled_abs > 1.75, + 2, + tl.where( + x_had_scaled_abs > 1.25, + 1.5, + tl.where( + x_had_scaled_abs > 0.75, + 1, + tl.where( + x_had_scaled_abs > 0.25, + 0.5, + 0, + ), + ), + ), + ), + ), + ), + ) + * x_had_scaled_sign + ) + if clip_mask_ptr is not None: + tl.store( + clip_mask_ptr + offsets, + tl.reshape(x_had_scaled_abs < 6, (BLOCK_SIZE,)), + mask=mask, + ) + + # dequantize + x_dequantized = x_fp4 * shared_exps + + # Reshape back to flat form for storage + x_dequantized_flat = tl.reshape(x_dequantized, (BLOCK_SIZE,)) + + # store + tl.store(output_ptr + offsets, x_dequantized_flat, mask=mask) + + +@torch.compiler.disable() +def mxfp4_forward_kernel_wrapper( + x, + hadamard_matrix, + return_clip_mask=False, + quest=False, + gaussian_scale=3 / 4, +): + """ + Refer code here: + https://github.com/IST-DASLab/FP-Quant/blob/master/inference_lib/src/fp_quant/module/triton/mxfp4.py + Apply Hadamard transform + group-wise FP4 quantize/dequantize on x. + + Note: + The output is still in the Hadamard-transformed space (no inverse Hadamard is applied). 
+ """ + # Pick a device — we require CUDA + device = x.device + if device.type != "cuda": + raise RuntimeError( + f"mxfp4_forward_kernel_wrapper requires a CUDA tensor for 'x', " + f"but got device '{device.type}'. Please move inputs to CUDA before calling." + ) + + # Ensure hadamard_matrix is on the same CUDA device + if hadamard_matrix.device != device: + hadamard_matrix = hadamard_matrix.to(device) + + # Make sure inputs are contiguous + x = x.contiguous() + hadamard_matrix = hadamard_matrix.contiguous() + + # Create output tensors on CUDA + output = torch.empty_like(x, device=device) + if return_clip_mask: + clip_mask = torch.empty_like(x, dtype=torch.bool, device=device).contiguous() + else: + clip_mask = None + + # Get total number of elements and calculate grid for launching the kernel + n_elements = x.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + + # Launch kernel – no need for `with torch.device(...)` + mxfp4_forward_kernel[grid]( + x_ptr=x, + hadamard_matrix_ptr=hadamard_matrix, + output_ptr=output, + clip_mask_ptr=clip_mask, + n_elements=n_elements, + hadamard_dim=hadamard_matrix.shape[-1], + group_size=32, + gaussian_scale=gaussian_scale, + quest=quest, + ) + + return output, clip_mask diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index b145b7211..d64fc207f 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy import os +import sys from dataclasses import asdict, dataclass, fields from typing import Any, Optional, Union @@ -20,20 +22,39 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization import BaseQuantizers, QuantizationConfig +from auto_round.algorithms.rotation import ( + HadamardConfig, + apply_rotation, + check_supported_schemes, + normalize_hadamard_config, +) from auto_round.compressors_new.shard_writer import ShardWriter -from auto_round.compressors_new.utils import _get_save_folder_name, block_forward +from auto_round.compressors_new.utils import _get_save_folder_name, block_forward, set_layer_config from auto_round.context.compress import CompressContext from auto_round.context.model import ModelContext from auto_round.formats import OutputFormat, get_formats from auto_round.logger import logger +from auto_round.schemes import ( + QuantizationScheme, + _handle_special_schemes, + _parse_scheme, + get_gguf_scheme, + preset_name_to_scheme, +) +from auto_round.special_model_handler import get_predefined_ignore_layers from auto_round.utils import ( INNER_SUPPORTED_LAYER_TYPES, SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, compile_func, + compress_layer_names, + convert_dtype_str2torch, extract_block_names_to_str, + find_matching_blocks, + get_block_names, is_debug_mode, is_hpex_available, + is_quantized_input_module, memory_monitor, ) from auto_round.utils.device import set_non_auto_device_map @@ -73,7 +94,7 @@ class SerializedCompressorConfig: super_bits: Optional[int] = None super_group_size: Optional[int] = None to_quant_block_names: Optional[list[str]] = None - transform_config: Optional[dict[str, Any]] = None + hadamard_config: Optional[dict[str, Any]] = None class BaseCompressor(object): @@ -83,7 +104,21 @@ class BaseCompressor(object): shard_writer: ShardWriter = None supported_types = 
SUPPORTED_LAYER_TYPES inner_supported_types = INNER_SUPPORTED_LAYER_TYPES - quant_block_list = None + + # ── Scheme state (populated during resolve_scheme / _scheme_post_init) ── + is_auto_scheme: bool = False + orig_scheme = None + scheme = None + scale_dtype = None + layer_config = None + has_qlayer_outside_block: bool = False + regex_config: dict = None + quant_block_list: list = None + to_quant_block_names = None + ignore_layers: str = "" + quant_lm_head: bool = False + _scheme_resolved: bool = False + scheme_generator = None def __init__( self, @@ -92,11 +127,13 @@ def __init__( tokenizer=None, platform="hf", format=None, + scheme="W4A16", low_gpu_mem_usage: bool = False, device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, seed: int = 42, low_cpu_mem_usage: bool = True, + hadamard_config: str | dict | HadamardConfig | None = None, **kwargs, ): self.quantize_config = None @@ -107,6 +144,9 @@ def __init__( assert self.quantize_config is not None, "QuantizationConfig is required for Compressor" self.config_list.remove(self.quantize_config) + # Scheme is passed directly to the compressor, not stored in QuantizationConfig. + self.scheme = scheme + # TODO: refactor calibration self.calibration = None @@ -176,7 +216,10 @@ def __init__( logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - self.transform_config = kwargs.pop("transform_config", {}) + # Accept legacy name for backward compatibility + self._hadamard_config = ( + hadamard_config or kwargs.pop("hadamard_config", None) or kwargs.pop("transform_config", None) + ) # Reset both context singletons before creating fresh instances so that # consecutive AutoRound creations don't inherit stale config from earlier ones. @@ -222,6 +265,209 @@ def __init__( self._adjust_torch_compile(enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile + # ── Scheme resolution ───────────────────────────────────────────────────── + + def resolve_scheme(self, model_context=None, compress_context=None, dataset: str = None) -> None: + """Phase-1 init: resolve scheme and bind config attrs (no model structure needed). + + Must be called BEFORE ``get_formats()`` and BEFORE ``_scheme_post_init()``. + Idempotent: safe to call multiple times. + """ + if self._scheme_resolved: + return + + if model_context is not None: + self.model_context = model_context + if compress_context is not None: + self.compress_context = compress_context + if dataset is not None: + self.dataset = dataset + + scheme_fields = {f.name for f in fields(QuantizationScheme)} + user_scheme_overrides = { + k: getattr(self.quantize_config, k) + for k in scheme_fields + if getattr(self.quantize_config, k, None) is not None + } + default_scheme, self.is_auto_scheme, final_attrs = _parse_scheme(self.scheme, user_scheme_overrides) + + for key, value in final_attrs.items(): + setattr(self.quantize_config, key, value) + if hasattr(self, key): + setattr(self, key, value) + self.quantize_config.check_config() + + self.orig_scheme = copy.deepcopy(self.scheme) + self.scheme = default_scheme + + gguf_scheme_name = get_gguf_scheme(self.scheme) + if self.scale_dtype is None: + self.scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.scale_dtype = convert_dtype_str2torch(self.scale_dtype) + + self._scheme_resolved = True + + def _scheme_post_init(self) -> None: + """Phase-4 init: build layer config on the patched model. 
+ + Requires ``resolve_scheme()`` to have been called first. + Must be called AFTER ``model_context.apply_patches()``. + """ + assert self._scheme_resolved, ( + "resolve_scheme() must be called before _scheme_post_init(). " + "BaseCompressor.post_init() does this automatically." + ) + + enable_gguf_official_mixed = not self.is_auto_scheme + + if self.quant_block_list is None: + quant_nontext_module = getattr(self.model_context, "quant_nontext_module", False) + all_blocks = get_block_names(self.model_context.model, quant_vision=quant_nontext_module) + self.quant_block_list = find_matching_blocks( + self.model_context.model, all_blocks, self.to_quant_block_names + ) + if self.to_quant_block_names is None and self.quant_block_list: + self.to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + self.quantize_config.to_quant_block_names = self.to_quant_block_names + + self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed) + + def _gen_auto_scheme(self) -> dict[str, dict]: + """Generate per-layer config via AutoScheme delta-loss selection.""" + if self.model_context.is_mllm: + logger.info("AutoScheme is not yet supported for multimodal LLMs.") + sys.exit(-1) + + if is_quantized_input_module(self.model_context.model): + logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") + sys.exit(-1) + + all_dtypes = [] + all_gguf = True + for option in self.orig_scheme.options: + dtype = "int" + if isinstance(option, str): + if not option.lower().startswith("gguf"): + all_gguf = False + option = preset_name_to_scheme(option) + else: + all_gguf = False + + if isinstance(option, QuantizationScheme): + dtype = option.data_type + elif isinstance(option, dict): + dtype = option.get("data_type", "int") + + all_dtypes.append(dtype) + + unique_dtypes = set(all_dtypes) + if len(unique_dtypes) > 1 and not all_gguf: + logger.warning( + "Models with mixed data_types " + "cannot yet be exported to real formats except GGUF. " + "Please save the model using the `fake` format for now." 
+ ) + + layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( + self.model_context.model, + self.layer_config, + self.scheme, + self.scale_dtype, + self.supported_types, + self.inner_supported_types, + self.quant_block_list, + self.ignore_layers, + self.quant_lm_head, + enable_gguf_official_mixed=False, + is_mllm=self.model_context.is_mllm, + ) + quant_layer_names = layer_config.keys() + scheme_keys = {f.name for f in fields(QuantizationScheme)} + fixed_layer_scheme_new = { + k: {key: v[key] for key in scheme_keys & v.keys()} + for k, v in layer_config.items() + if v.get("fixed_by_user", False) + } + + from auto_round.auto_scheme.gen_auto_scheme import GenScheme + + if ( + not self.compress_context.enable_torch_compile + and self.quantize_config.super_bits is None + and not self.orig_scheme.low_gpu_mem_usage + ): + logger.warning("we strongly recommend to set `enable_torch_compile` to True for AutoScheme to save VRAM") + self.scheme_generator = GenScheme( + self.orig_scheme, + self.model_context.model, + quant_layer_names, + fixed_layer_scheme_new, + self.dataset, + device_map=self.compress_context.device_map, + tokenizer=self.model_context.tokenizer, + enable_torch_compile=self.compress_context.enable_torch_compile, + ) + layer_config = self.scheme_generator.get_layer_config() + return layer_config + + def configure_layer_config(self, enable_gguf_official_mixed: bool | None = True) -> None: + """Build ``self.layer_config`` from the resolved scheme on the patched model.""" + is_gguf_format = (f := getattr(self.compress_context, "formats", None)) is not None and "gguf" in f + predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) + compressed_predefined_ignore_layers = compress_layer_names(predefined_ignore_layers) + if not is_gguf_format: + predefined_ignore_layers = get_predefined_ignore_layers(self.model_context.model) + if predefined_ignore_layers: + logger.info(f"Using predefined ignore_layers: {compressed_predefined_ignore_layers}") + tmp_str = ",".join(predefined_ignore_layers) + if self.ignore_layers == "": + self.ignore_layers = tmp_str + else: + self.ignore_layers += "," + tmp_str + + if self.is_auto_scheme: + self.layer_config = self._gen_auto_scheme() + else: + self.layer_config = _handle_special_schemes( + self.orig_scheme, + self.layer_config, + self.model_context.model, + supported_types=SUPPORTED_LAYER_TYPES, + inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, + quant_lm_head=self.quant_lm_head, + mllm=self.model_context.is_mllm, + ) + _gguf_orig_fmt = getattr(self, "_gguf_original_format_name", None) + if _gguf_orig_fmt and "_MIXED" in _gguf_orig_fmt.upper(): + self.layer_config = _handle_special_schemes( + _gguf_orig_fmt.lower(), + self.layer_config, + self.model_context.model, + supported_types=SUPPORTED_LAYER_TYPES, + inner_supported_types=INNER_SUPPORTED_LAYER_TYPES, + quant_lm_head=self.quant_lm_head, + mllm=self.model_context.is_mllm, + ) + + fill_default_value = not self.is_auto_scheme + self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config( + self.model_context.model, + self.layer_config, + self.scheme, + self.scale_dtype, + SUPPORTED_LAYER_TYPES, + INNER_SUPPORTED_LAYER_TYPES, + self.quant_block_list, + self.ignore_layers, + self.quant_lm_head, + enable_gguf_official_mixed=enable_gguf_official_mixed, + is_mllm=self.model_context.is_mllm, + fill_default_value=fill_default_value, + gguf_format_name=getattr(self, "_gguf_format_name", None), + ) + + # 
───────────────────────────────────────────────────────────────────────── + @property def mllm(self): return self.model_context.is_mllm @@ -236,7 +482,7 @@ def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: # Determine fp8 / nvfp4 intent from raw config before scheme resolution. cfg = self.quantize_config - raw_scheme = cfg.scheme if isinstance(cfg.scheme, str) else "" + raw_scheme = self.scheme if isinstance(self.scheme, str) else "" raw_dt = (cfg.data_type or "").lower() raw_adt = (cfg.act_data_type or "").lower() raw_scheme_upper = raw_scheme.upper() @@ -285,7 +531,7 @@ def _get_calibration_dataset(self) -> str: return dataset from auto_round.auto_scheme.gen_auto_scheme import AutoScheme - scheme = self.quantize_config.scheme + scheme = self.scheme if isinstance(scheme, AutoScheme) and scheme.dataset: return scheme.dataset return "NeelNanda/pile-10k" @@ -309,14 +555,28 @@ def post_init(self) -> None: return # ── Phase 1: resolve scheme ─────────────────────────────────────────── - # Creates the quantizer and runs scheme parsing (pure config work: - # sets data_type / bits / sym / scale_dtype etc.). - self.quantizer = BaseQuantizers.from_config(self.quantize_config) - self.quantizer.resolve_scheme( + # Initialize scheme state from quantize_config before resolving. + cfg = self.quantize_config + self.scale_dtype = cfg.scale_dtype + self.layer_config = cfg.layer_config + self.ignore_layers = cfg.ignore_layers + self.quant_lm_head = cfg.quant_lm_head + self.to_quant_block_names = cfg.to_quant_block_names + + # Resolve the scheme (pure config work: sets data_type / bits / sym / + # scale_dtype etc. on both self and self.quantize_config). + self.resolve_scheme( model_context=self.model_context, compress_context=self.compress_context, dataset=self._get_calibration_dataset(), ) + + # Create the quantizer now that the config holds resolved values. + self.quantizer = BaseQuantizers.from_config(self.quantize_config) + self.quantizer.model_context = self.model_context + self.quantizer.compress_context = self.compress_context + self.quantizer.model = self.model_context.model + self.quantizer.scale_dtype = self.scale_dtype self.wrapper_block = wrapper_block # ── Phase 2: resolve output format ─────────────────────────────────── @@ -328,12 +588,12 @@ def post_init(self) -> None: ShardWriter.reset() self.shard_writer = ShardWriter(self.model_context.model, bits=8) - # ── Phase 2b: propagate format-adjusted attrs back to quantizer ────── + # ── Phase 2b: propagate GGUF-adjusted attrs back to quantizer ──────── # gguf_args_check (called inside get_formats) may have overridden # bits / sym / data_type / super_bits / super_group_size / group_size # on *this* BaseCompressor object. The quantizer stored its own copies # from Phase 1 (resolve_scheme), so we must sync them now, before - # quantizer.post_init() builds the layer_config in Phase 4. + # _scheme_post_init() builds the layer_config in Phase 4. _gguf_forwarded_attrs = ( "bits", "sym", @@ -350,8 +610,10 @@ def post_init(self) -> None: if _attr not in ("scale_dtype", "act_bits") and getattr(self.quantizer, _attr) != self.__dict__[_attr]: _any_gguf_attr_changed = True setattr(self.quantizer, _attr, self.__dict__[_attr]) - # If gguf_args_check changed scheme attrs, rebuild self.quantizer.scheme - # so that set_layer_config() uses the correct default_dict and gguf_name. 
+ # If gguf_args_check changed scheme attrs, rebuild the scheme on both + # the compressor (SchemeMixin) and the quantizer so that + # configure_layer_config() and set_layer_config() use the correct + # default_dict and gguf_name. if _any_gguf_attr_changed: from auto_round.schemes import PRESET_SCHEMES from auto_round.schemes import QuantizationScheme as _QS @@ -380,30 +642,45 @@ def post_init(self) -> None: _gguf_fmt_name = _preset_key break if _gguf_preset_scheme is not None: - self.quantizer.scheme = _gguf_preset_scheme - # Store the exact gguf format name so set_layer_config can - # use it directly, avoiding Q4_K_S / Q4_K_M ambiguity. - self.quantizer._gguf_format_name = _gguf_fmt_name + # Update scheme on both compressor and quantizer. + self.scheme = _gguf_preset_scheme + # Store the exact gguf format name so configure_layer_config / + # set_layer_config can use it directly, avoiding Q4_K_S / Q4_K_M ambiguity. + self._gguf_format_name = _gguf_fmt_name # Store original format name (may include _mixed) for _handle_special_schemes if _gguf_original_fmt_name: - self.quantizer._gguf_original_format_name = _gguf_original_fmt_name + self._gguf_original_format_name = _gguf_original_fmt_name else: - _new_scheme_dict = {f.name: getattr(self.quantizer, f.name, None) for f in fields(_QS)} - self.quantizer.scheme = _QS.from_dict({k: v for k, v in _new_scheme_dict.items() if v is not None}) + _new_scheme_dict = {f.name: getattr(self, f.name, None) for f in fields(_QS)} + _new_scheme = _QS.from_dict({k: v for k, v in _new_scheme_dict.items() if v is not None}) + self.scheme = _new_scheme - # ── Phase 2c: sync layer_config set by GGUFFormat._mixed handling ─── + # ── Phase 2c: merge layer_config set by GGUFFormat._mixed handling ─── # Inner GGUFFormat("q2_k_mixed", ar) calls _handle_special_schemes and - # stores the result in ar.__dict__["layer_config"] (via ar.layer_config=). - # This is NOT the same object as quantizer.layer_config, so we must - # forward it here before quantizer.post_init() builds the final config. - _compressor_layer_cfg = self.__dict__.get("layer_config") - if _compressor_layer_cfg is not None and isinstance(_compressor_layer_cfg, dict) and _compressor_layer_cfg: - # Merge: let the GGUFFormat-set entries take precedence over any - # user-provided entries already in quantizer.layer_config. - if self.quantizer.layer_config is None: - self.quantizer.layer_config = {} - for _lname, _lval in _compressor_layer_cfg.items(): - self.quantizer.layer_config.setdefault(_lname, _lval) + # stores the result directly in ar.__dict__["layer_config"]. + # self.layer_config is already the authoritative owner of this attr, so + # just merge any GGUFFormat-supplied per-layer entries that may have + # been set before Phase 1 (during get_formats → gguf_args_check). 
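# A minimal, self-contained sketch of the merge rule applied just below
# (layer names and bit-widths here are made up for illustration): entries
# derived by GGUFFormat only fill in layers the user did not configure, and
# dict.setdefault leaves every user-provided entry untouched.
user_layer_config = {"model.layers.0.self_attn.q_proj": {"bits": 8}}
gguf_layer_config = {
    "model.layers.0.self_attn.q_proj": {"bits": 2},  # ignored: user already set it
    "lm_head": {"bits": 6},                          # merged: user did not set it
}
merged = dict(user_layer_config)
for name, scheme in gguf_layer_config.items():
    merged.setdefault(name, scheme)
assert merged["model.layers.0.self_attn.q_proj"]["bits"] == 8
assert merged["lm_head"]["bits"] == 6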
+ _gguf_layer_cfg = { + k: v + for k, v in self.__dict__.get("layer_config", {}).items() + if k not in (self.quantize_config.layer_config or {}) + } + if _gguf_layer_cfg: + if self.layer_config is None: + self.layer_config = {} + for _lname, _lval in _gguf_layer_cfg.items(): + self.layer_config.setdefault(_lname, _lval) + + # ── Phase 2d: apply hadamard transform ─────────────────────────────── + if self._hadamard_config: + check_supported_schemes(self.scheme) + self.model_context.model = apply_rotation( + self.model_context.model, + self._hadamard_config, + need_calibration=True if self.quantize_config.iters > 0 else False, + ) + self._hadamard_config = normalize_hadamard_config(self._hadamard_config) # ── Phase 3: patch model structure ─────────────────────────────────── # update_module() may replace layers (e.g. MoE expert merging); must @@ -413,7 +690,19 @@ def post_init(self) -> None: # ── Phase 4: build layer config ────────────────────────────────────── # configure_layer_config() walks the patched model; _gen_auto_scheme() # (AutoScheme path) runs delta-loss forward+backward passes. - self.quantizer.post_init() + # Both methods now live in BaseCompressor and operate on self directly. + self._scheme_post_init() + + # Sync the fully-resolved scheme state to the quantizer so that + # quantization methods (quantize_block, quantize_layer, etc.) have + # access to layer_config, scale_dtype, quant_block_list, etc. + self.quantizer.layer_config = self.layer_config + self.quantizer.has_qlayer_outside_block = self.has_qlayer_outside_block + self.quantizer.regex_config = self.regex_config + self.quantizer.quant_block_list = self.quant_block_list + self.quantizer.to_quant_block_names = self.to_quant_block_names + self.quantizer.scale_dtype = self.scale_dtype + self.quantizer.ignore_layers = self.ignore_layers # ── Phase 5: hardware / compile setup ──────────────────────────────── set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) @@ -427,7 +716,7 @@ def post_init(self) -> None: self._offloader.reset() # Disable inplace when quantized layers live outside transformer blocks. - if self.quantizer.has_qlayer_outside_block and self.need_calib: + if self.has_qlayer_outside_block and self.need_calib: self.inplace = False if not hasattr(self, "formats"): @@ -483,7 +772,7 @@ def _adjust_immediate_packing_and_saving(self): if len(formats) == 1 and not formats[0].is_fake() and self.inplace: self.is_immediate_packing = True - if self.quantizer.has_qlayer_outside_block and self.need_calib: + if self.has_qlayer_outside_block and self.need_calib: self.is_immediate_packing = False if not ("causallm" in self.model_context.model.__class__.__name__.lower() and not self.model_context.is_mllm): @@ -522,7 +811,7 @@ def _adjust_immediate_packing_and_saving(self): "Keeping `low_cpu_mem_usage` enabled in RTN mode (iters=0): " "RTN path uses blockwise quantization and supports per-block offloading." ) - elif self.quantizer.has_qlayer_outside_block and not isinstance(self.quantize_config, RTNConfig): + elif self.has_qlayer_outside_block and not isinstance(self.quantize_config, RTNConfig): logger.warning( "`low_cpu_mem_usage` is not fully supported " "when there are quantized layers outside blocks and optimized RTN is disabled. 
" diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 2fb746987..413521b81 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -48,6 +48,7 @@ convert_module_to_hp_if_necessary, get_block_names, get_module, + is_auto_device_mapping, is_quantized_input_module, memory_monitor, mv_module_from_gpu, @@ -702,13 +703,93 @@ def _quantize_blocks( self._offloader.reload(model, names) block_name_or_names = n if nblocks == 1 else names - q_input, input_ids = self.quantizer.quantize_block( - block_name_or_names, + + # ── Infrastructure: materialize, dtype convert, device placement ── + materialize_model_(m) + convert_module_to_hp_if_necessary(m, self.model_context.amp_dtype, self.compress_context.device) + + if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: + from auto_round.utils.device import set_auto_device_map_for_block_with_tuning + + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + m, + self.compress_context.device_map, + input_ids, + self.compress_context.low_gpu_mem_usage, + self.quantizer.batch_size, + self.compress_context.device, + ) + else: + m = m.to(self.compress_context.device) + card_0_in_high_risk, loss_device = False, self.compress_context.device + + if len(self.compress_context.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _n, _mod in m.named_modules(): + if len(list(_mod.children())) != 0 or not hasattr(_mod, "tuning_device"): + continue + add_hook_to_module(_mod, AlignDevicesHook(_mod.tuning_device, io_same_device=True), True) + + # ── Infrastructure: collect reference output and act_max ────────── + bs = self.quantizer.batch_size * self.quantizer.infer_bs_coeff + if q_input is None: + hook_handles = self.quantizer._register_act_max_hook(m) + reference_output = self.quantizer._get_block_outputs(m, input_ids, input_others, bs) + for h in hook_handles: + h.remove() + else: + reference_output = self.quantizer._get_block_outputs(m, input_ids, input_others, bs) + hook_handles = self.quantizer._register_act_max_hook(m) + if hook_handles: + self.quantizer._get_block_outputs(m, q_input, input_others, bs, save_output=False) + for h in hook_handles: + h.remove() + + # ── Infrastructure: swap q_input ────────────────────────────────── + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compress_context.device_list) + else: + clear_memory(device_list=self.compress_context.device_list) + input_ids = q_input + + # ── Pure algorithm: delegates to quantizer ──────────────────────── + mid_iter_mem_check = self.compress_context.low_gpu_mem_usage and card_0_in_high_risk + self.quantizer.quantize_block( + m, input_ids, input_others, - q_input=q_input, + reference_output, + loss_device=loss_device, + mid_iter_mem_check=mid_iter_mem_check, ) + # ── Infrastructure: collect q_outputs if needed ─────────────────── + if self.quantizer.enable_quanted_input: + q_input = self.quantizer._get_block_outputs(m, input_ids, input_others, bs) + else: + q_input = None + + # ── Infrastructure: hook removal, device cleanup, logging ───────── + if len(self.compress_context.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(m) + mv_module_from_gpu(m) + if self.enable_torch_compile: + torch._dynamo.reset() + clear_memory(input_ids if q_input is None else None, device_list=self.compress_context.device_list) + memory_monitor.log_summary() + + # ── 
Infrastructure: immediate_pack / shard write ────────────────── + if self.compress_context.is_immediate_saving: + for _n, _mod in m.named_modules(): + if hasattr(_mod, "bits") and check_to_quantized(_mod): + from auto_round.compressors_new.utils import immediate_pack as _immediate_pack + + _immediate_pack(_mod.global_name, self.quantizer.layer_config) + + input_ids = q_input if q_input is not None else input_ids + if self.is_immediate_saving: self.shard_writer.write(m, is_finalize=False) @@ -864,7 +945,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: ) if len(unquantized_layers) > 0: compressed_unquantized_layers = compress_layer_names(unquantized_layers) - summary_info += f", {compressed_unquantized_layers} have not been quantized" + summary_info += f", unquantized layers: {compressed_unquantized_layers}" logger.info(summary_info) self.model_context.quantized = True @@ -1100,11 +1181,57 @@ def _quantize_via_rtn_blockwise(self) -> None: for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") - self.quantizer.quantize_block( - block_name, + block = get_module(self.model_context.model, block_name) + + # ── Infrastructure: materialize, dtype convert, device placement ── + materialize_model_(block) + block.to("cpu") + block = convert_module_to_hp_if_necessary( + block, dtype=self.model_context.amp_dtype, device=self.compress_context.device + ) + if ( + is_auto_device_mapping(self.compress_context.device_map) + and len(self.compress_context.device_list) > 1 + ): + from auto_round.utils.device import set_auto_device_map_for_block_with_tuning + + set_auto_device_map_for_block_with_tuning( + block, + self.compress_context.device_map, + input_ids, + self.compress_context.low_gpu_mem_usage, + self.quantizer.batch_size, + self.compress_context.device, + ) + if len(self.compress_context.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, _mod in block.named_modules(): + if len(list(_mod.children())) != 0 or not hasattr(_mod, "tuning_device"): + continue + add_hook_to_module(_mod, AlignDevicesHook(_mod.tuning_device, io_same_device=True), True) + else: + block = block.to(self.compress_context.device) + + # ── Infrastructure: register act_max hook and run forward pass ── + hook_handles = self.quantizer._register_act_max_hook(block) + self.quantizer._get_block_outputs( + block, input_ids, input_others, + self.quantizer.batch_size * self.quantizer.infer_bs_coeff, ) + for h in hook_handles: + h.remove() + + if len(self.compress_context.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + # ── Pure algorithm ──────────────────────────────────────────── + self.quantizer.quantize_block(block) + + # ── Infrastructure: cleanup ─────────────────────────────────── + mv_module_from_gpu(block) if self.compress_context.low_cpu_mem_usage and not self.is_immediate_saving: self._offloader(self.model_context.model, block_name) @@ -1246,7 +1373,9 @@ def quantize(self): self.post_init() return self._quantize_impl() - @torch.inference_mode() + # Use no_grad instead of inference_mode + # https://github.com/intel/auto-round/issues/1620 + @torch.no_grad() def _quantize_impl(self): formats = getattr(self, "formats", None) or [] diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 0f6bf7cc6..45bb5b997 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -13,7 +13,31 @@ from auto_round.compressors_new.utils import check_need_act_calibration 
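# A stand-alone sketch (plain PyTorch, not auto-round code) of the practical
# difference behind the inference_mode -> no_grad switch in calib.py above
# (see https://github.com/intel/auto-round/issues/1620): tensors created under
# inference_mode become permanent "inference tensors" and cannot take part in
# any later autograd computation, while tensors created under no_grad can.
import torch

w = torch.randn(4, 4, requires_grad=True)

with torch.no_grad():
    cached = torch.randn(4, 4)  # ordinary tensor, reusable later
(cached @ w).sum().backward()  # fine: the gradient still flows to w

with torch.inference_mode():
    cached_inf = torch.randn(4, 4)  # inference tensor
try:
    (cached_inf @ w).sum().backward()  # raises: autograd rejects inference tensors
except RuntimeError as exc:
    print(f"autograd rejected the inference tensor: {exc}")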
from auto_round.compressors_new.zero_shot import ZeroShotCompressor from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme +from auto_round.schemes import QuantizationScheme, _parse_scheme + + +def _preview_resolved_attrs(config, scheme=None) -> dict: + """Resolve scheme attributes without mutating config, for routing decisions. + + Called in ``Compressor.__new__`` before the concrete compressor class is + chosen. ``SchemeMixin.resolve_scheme()`` will do the authoritative + resolution later; this is just a lightweight preview so routing logic + (``enable_imatrix``, ``needs_act_calib``, etc.) can use the correct values + even when the user specified only ``scheme=`` without explicit bit/dtype args. + + Returns: + dict: resolved attributes (may be empty if scheme cannot be previewed). + """ + if isinstance(scheme, AutoScheme): + # AutoScheme needs model info — cannot preview, rely on raw config attrs + return {} + scheme_attr_names = QuantizationScheme.get_attributes() + user_overrides = {k: getattr(config, k) for k in scheme_attr_names if getattr(config, k, None) is not None} + try: + _, _, final_attrs = _parse_scheme(scheme, user_overrides) + return final_attrs + except Exception: + return {} def is_weight_scheme(scheme): @@ -63,6 +87,7 @@ def __new__( tokenizer=None, platform="hf", format=None, + scheme="W4A16", **kwargs, ): # using different compressor base on AlgConfigs @@ -106,23 +131,41 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): elif isinstance(config, RTNConfig): enable_imatrix = False disable_opt_rtn = getattr(config, "disable_opt_rtn", False) + # If disable_opt_rtn was not explicitly set and scheme is W8A16/W8A8, + # auto-disable optimization to improve efficiency. + if getattr(config, "orig_disable_opt_rtn", None) is None: + if isinstance(scheme, str) and scheme.upper() in ["W8A16", "W8A8"]: + logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") + disable_opt_rtn = True + config.disable_opt_rtn = True if not disable_opt_rtn: has_gguf_k = "gguf" in format.lower() and "_k" in format.lower() if format else False if has_gguf_k: enable_imatrix = True else: - sym = getattr(config, "sym", True) - if sym is not None and sym is False: + # Resolve scheme attrs for routing (config hasn't been through + # SchemeMixin yet; user may have specified only scheme="W4A16"). 
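# A toy demonstration of why the preview helper above is needed (DummyConfig
# is a stand-in, not a real auto-round class): when the user only passes
# scheme="W4A16", the raw config attributes are still None at routing time,
# so decisions such as enable_imatrix must read the previewed values instead
# of the unresolved config fields.
class DummyConfig:
    sym = None
    bits = None
    data_type = None
    act_bits = None

cfg = DummyConfig()
print(getattr(cfg, "sym", None))  # None: the raw attribute alone would mis-route
# resolved = _preview_resolved_attrs(cfg, scheme="W4A16")
# resolved.get("sym") / resolved.get("data_type") would then carry the preset's values.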
+ _resolved = _preview_resolved_attrs(config, scheme) + _sym = _resolved.get("sym", getattr(config, "sym", None)) + _data_type = _resolved.get("data_type", getattr(config, "data_type", "") or "") + if _sym is not None and _sym is False: enable_imatrix = False - elif getattr(config, "data_type", "") == "int": + elif _data_type == "int": enable_imatrix = True - elif is_weight_scheme(config.scheme): + elif is_weight_scheme(scheme): enable_imatrix = True - - needs_act_calib = getattr(config, "is_act_quantize", False) and check_need_act_calibration( - getattr(config, "act_dynamic", None), - getattr(config, "act_data_type", None), - getattr(config, "act_bits", 16), + else: + _resolved = {} + + _resolved = _resolved if not disable_opt_rtn else _preview_resolved_attrs(config, scheme) + _act_bits = _resolved.get("act_bits", getattr(config, "act_bits", None)) + _act_data_type = _resolved.get("act_data_type", getattr(config, "act_data_type", None)) + _act_dynamic = _resolved.get("act_dynamic", getattr(config, "act_dynamic", None)) + _is_act_quantize = _act_bits is not None and _act_bits <= 8 + needs_act_calib = _is_act_quantize and check_need_act_calibration( + _act_dynamic, + _act_data_type, + _act_bits if _act_bits is not None else 16, static_kv_dtype=kwargs.get("static_kv_dtype"), static_attention_dtype=kwargs.get("static_attention_dtype"), ) @@ -131,7 +174,7 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): # scheme selection, regardless of whether imatrix is needed. from auto_round.auto_scheme.gen_auto_scheme import AutoScheme as _AutoScheme - is_auto_scheme = isinstance(config.scheme, _AutoScheme) + is_auto_scheme = isinstance(scheme, _AutoScheme) if enable_imatrix or needs_act_calib or is_auto_scheme: config._alg_cls = "OptimizedRTNQuantizer" @@ -307,7 +350,6 @@ def __new__( # RTN mode disable_opt_rtn = kwargs.pop("disable_opt_rtn", None) config = RTNConfig( - scheme=scheme, layer_config=layer_config, bits=bits, group_size=group_size, @@ -334,7 +376,6 @@ def __new__( enable_quanted_input = kwargs.pop("enable_quanted_input", True) config = AutoRoundConfig( - scheme=scheme, layer_config=layer_config, iters=iters, nsamples=nsamples, @@ -389,6 +430,7 @@ def __new__( tokenizer=tokenizer, platform=platform, format=format, + scheme=scheme, dataset=dataset, low_gpu_mem_usage=low_gpu_mem_usage, device_map=device_map, diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 75a99da28..c48144411 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -353,7 +353,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str m = get_module(model, name) if len(list(m.children())) == 0 and type(m) not in supported_types: layer_config.pop(name) - logger.warning(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") + logger.warning( + f"'{name}' exists in the model but is not a supported quantization target " + f"in the current scheme, ignoring its setting in `layer_config`" + ) continue regex = re.compile(to_standard_regex(name)) @@ -636,13 +639,16 @@ def _set_config(config, target_config): i_attention_wv = 0 i_ffn_down = 0 layer_config_copy = copy.deepcopy(layer_config) - target_bits = None + base_target_bits = None if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit(): - target_bits = int(inner_gguf_format[6]) + base_target_bits = int(inner_gguf_format[6]) for layer_name, config in 
layer_config_copy.items(): if not check_to_quantized(config): continue + # Reset target_bits each iteration to prevent lm_head/embedding settings + # from bleeding into subsequent block layers and bypassing their special logic. + target_bits = base_target_bits new_type = GGUF_CONFIG[target_gguf_format]["mostly"] layer = get_module(model, layer_name) if type(layer) == transformers.pytorch_utils.Conv1D: diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 915c5da08..840ff9e54 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -14,20 +14,14 @@ import copy from typing import Any, Union -import accelerate import torch from tqdm import tqdm from auto_round.algorithms.alg_config import AlgConfig from auto_round.compressors_new.base import BaseCompressor -from auto_round.compressors_new.utils import ( - _get_quantized_layer_names_outside_blocks, - check_need_act_calibration, -) from auto_round.logger import logger from auto_round.modeling.fused_moe.replace_modules import materialize_model_ from auto_round.utils import ( - SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, convert_module_to_hp_if_necessary, @@ -38,8 +32,6 @@ global_state, memory_monitor, set_module, - to_device, - to_dtype, ) @@ -77,100 +69,9 @@ def __init__( ) self.lr = 5e-3 - def _quantize_via_rtn_blockwise(self) -> None: - """Quantize model layers block by block using cached inputs and imatrix.""" - - all_blocks = self.quantizer.quant_block_list if self.quantizer.quant_block_list else get_block_names(self.model) - if not all_blocks: - raise ValueError("Could not find any blocks. Check the model or quant_block_list.") - - all_first_block_names = [block[0] for block in all_blocks] - layer_names = _get_quantized_layer_names_outside_blocks( - model=self.model_context.model, - layer_config=self.quantizer.layer_config, - supported_types=SUPPORTED_LAYER_TYPES, - quant_block_list=self.quantizer.quant_block_list, - ) - if self.quantize_config.is_act_quantize and (not self.quantize_config.act_dynamic or len(layer_names) > 0): - if len(layer_names) > 0: - logger.warning( - "quantize layers outside blocks for static activation quantizaiton" - " will significantly increase calibration time" - ) - all_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.quantize_config.nsamples, layer_names - ) - else: - all_inputs = self.cache_inter_data(all_first_block_names, self.quantize_config.nsamples) - - # Clear hooks for multi-GPU setups - if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model_context.model) - - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - - for block_names in all_blocks: - first_block = block_names[0] - inputs = all_inputs.pop(first_block) - input_keys = [k for k in inputs if k.startswith("hidden_state")] - if len(input_keys) != 1: - raise RuntimeError( - "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_keys[0]) - - clear_memory(self.inputs, device_list=self.compress_context.device_list) - - total_samples = len(inputs["input_ids"]) - if total_samples < self.quantize_config.batch_size: - self.quantize_config.batch_size = total_samples - logger.warning(f"Forcing batch size to {total_samples}") - - input_ids = to_device(inputs.pop("input_ids"), self.compress_context.cache_device) - input_others = to_device(inputs, self.compress_context.cache_device) - - tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 - input_ids = [id_.to(tmp_dtype) for id_ in input_ids] - - for key, val in input_others.items(): - if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): - input_others[key] = val.to(tmp_dtype) - elif isinstance(val, list): - input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - self.quantizer.quantize_block(block_name, input_ids, input_others) - - if self.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader(self.model_context.model, block_name) - if block_name == block_names[-1]: - clear_memory(input_ids, device_list=self.compress_context.device_list) - else: - clear_memory(device_list=self.compress_context.device_list) - - memory_monitor.log_summary() - pbar.update(1) - pbar.close() - # Process remaining layers not in blocks - # Collect names of quantizable layers not belonging to any block - remain_layer_names = [] - block_name_set = set(name for block in all_blocks for name in block) - for n, m in self.model_context.model.named_modules(): - if not check_to_quantized(m): - continue - # Skip if this layer is part of any block (by prefix match) - if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): - continue - remain_layer_names.append(n) - - for name in remain_layer_names: - dtype = None - if self.super_group_size is not None: - dtype = torch.float32 - self.quantizer.quantize_layer(name, dtype=dtype) - - @torch.inference_mode() + # Use no_grad instead of inference_mode + # https://github.com/intel/auto-round/issues/1620 + @torch.no_grad() def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. Returns: @@ -186,119 +87,93 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # Release memory clear_memory(device_list=self.device_list) - if self.quantize_config.is_act_quantize and check_need_act_calibration( - self.quantize_config.act_dynamic, - self.quantize_config.act_data_type, - self.quantize_config.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): - model = self.model_context.model - hook_handles = self.quantizer._register_act_max_hook(model) - try: - self._quantize_via_rtn_blockwise() - except torch.OutOfMemoryError: - logger.warning("Fallback to CPU. 
Consider using more GPUs via `--device 0,1,2,3`.") - model = model.to("cpu") - self.model_context.model = model - clear_memory(device_list=self.device_list) - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - orig_device = self.compress_context.device - self.compress_context.device = "cpu" - self._quantize_via_rtn_blockwise() - self.compress_context.device = orig_device - for handle in hook_handles: - handle.remove() + # By default, we go with layer-wise way if no replacement happened. + # In RTN mode (iters == 0), force blockwise quantization to avoid + # full-model materialization and linear CPU RAM growth. + use_blockwise_quantization = global_state.replaced_module_count > 0 + if not use_blockwise_quantization: + logger.info( + "RTN mode detected (iters=0): force blockwise quantization to avoid " + "layer-wise full-model materialization." + ) + use_blockwise_quantization = True + tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) + if tied_weights_keys is None: + tied_weights_keys = [] + if isinstance(tied_weights_keys, dict): + tied_weights_values = list(tied_weights_keys.values()) else: - # By default, we go with layer-wise way if no replacement happened. - # In RTN mode (iters == 0), force blockwise quantization to avoid - # full-model materialization and linear CPU RAM growth. - use_blockwise_quantization = global_state.replaced_module_count > 0 - if not use_blockwise_quantization: - logger.info( - "RTN mode detected (iters=0): force blockwise quantization to avoid " - "layer-wise full-model materialization." - ) - use_blockwise_quantization = True - tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) - if tied_weights_keys is None: - tied_weights_keys = [] - if isinstance(tied_weights_keys, dict): - tied_weights_values = list(tied_weights_keys.values()) - else: - tied_weights_values = list(tied_weights_keys) - tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias - # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it - if getattr(self, "formats", None) and self.formats[0].is_gguf(): - lm_head_name = get_lm_head_name(self.model) - if lm_head_name is not None: - tied_weights_layers.append(lm_head_name) - - if use_blockwise_quantization: # The ram usage is a little higher - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - for block_names in all_blocks: - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - self.quantizer.quantize_block(block_name) - - if self.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader(self.model, block_name) - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - pbar.update(1) - cnt = 1 - remain_layer_names = [] - block_name_set = set(name for block in all_blocks for name in block) - for n, m in self.model_context.model.named_modules(): - if not check_to_quantized(m): - continue - # Skip if this layer is part of any block (by prefix match) - if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): - continue - remain_layer_names.append(n) - for name in remain_layer_names: - logger.info(f"Quantizing remaining layer {name} on CPU.") - self.quantizer.quantize_layer(name) + tied_weights_values = list(tied_weights_keys) + tied_weights_layers = 
[".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias + # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it + if getattr(self, "formats", None) and self.formats[0].is_gguf(): + lm_head_name = get_lm_head_name(self.model) + if lm_head_name is not None: + tied_weights_layers.append(lm_head_name) + + if use_blockwise_quantization: # The ram usage is a little higher + + all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + for block_names in all_blocks: + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.model, block_name) + self.quantizer.quantize_block(block) + + if self.low_cpu_mem_usage and not self.is_immediate_saving: + self._offloader(self.model, block_name) + clear_memory(device_list=self.device_list) + memory_monitor.log_summary() + pbar.update(1) + cnt = 1 + remain_layer_names = [] + block_name_set = set(name for block in all_blocks for name in block) + for n, m in self.model_context.model.named_modules(): + if not check_to_quantized(m): + continue + # Skip if this layer is part of any block (by prefix match) + if any(n == block_name or n.startswith(f"{block_name}.") for block_name in block_name_set): + continue + remain_layer_names.append(n) + for name in remain_layer_names: + logger.info(f"Quantizing remaining layer {name} on CPU.") + self.quantizer.quantize_layer(name) + cnt += 1 + if cnt % 10 == 0: + clear_memory(device_list=self.device_list) + memory_monitor.log_summary() + else: + all_to_quantized_module_names: list[str] = [ + n for n, m in self.model.named_modules() if check_to_quantized(m) + ] + all_to_quantized_module_names = all_to_quantized_module_names + materialize_model_(self.model) + self.model.to("cpu") + block_names_cnt = len(flatten_list(get_block_names(self.model, True))) + clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt + cnt = 0 + pbar = tqdm(all_to_quantized_module_names) + + for n, m in self.model.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + pbar.set_description(f"Quantizing {m.global_name}") + self.quantizer.quantize_layer(m.global_name) cnt += 1 - if cnt % 10 == 0: + pbar.update() + if cnt % clear_mem_freq == 0: clear_memory(device_list=self.device_list) memory_monitor.log_summary() - else: - all_to_quantized_module_names: list[str] = [ - n for n, m in self.model.named_modules() if check_to_quantized(m) - ] - all_to_quantized_module_names = all_to_quantized_module_names - materialize_model_(self.model) - self.model.to("cpu") - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - cnt = 0 - pbar = tqdm(all_to_quantized_module_names) - - for n, m in self.model.named_modules(): - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - pbar.set_description(f"Quantizing {m.global_name}") - self.quantizer.quantize_layer(m.global_name) - cnt += 1 - pbar.update() - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - elif ( - not any(m.children()) - and len(m.state_dict()) > 0 - and n not in tied_weights_layers - and self.is_immediate_saving - ): - set_module(self.model, n, copy.deepcopy(m)) - self.shard_writer.write(name=n) - m.to("meta") + elif ( + not any(m.children()) + and 
len(m.state_dict()) > 0 + and n not in tied_weights_layers + and self.is_immediate_saving + ): + set_module(self.model, n, copy.deepcopy(m)) + self.shard_writer.write(name=n) + m.to("meta") # Convert remaining fp8 convert_module_to_hp_if_necessary(self.model, self.amp_dtype, self.device) From b32bc685ffac9a5deb64233f47b5f7fde50e819f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 30 Mar 2026 16:15:48 +0800 Subject: [PATCH 21/90] support multi rotation Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 40 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index d64fc207f..b7b510bae 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -23,10 +23,9 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization import BaseQuantizers, QuantizationConfig from auto_round.algorithms.rotation import ( - HadamardConfig, + BaseRotationConfig, apply_rotation, check_supported_schemes, - normalize_hadamard_config, ) from auto_round.compressors_new.shard_writer import ShardWriter from auto_round.compressors_new.utils import _get_save_folder_name, block_forward, set_layer_config @@ -94,7 +93,7 @@ class SerializedCompressorConfig: super_bits: Optional[int] = None super_group_size: Optional[int] = None to_quant_block_names: Optional[list[str]] = None - hadamard_config: Optional[dict[str, Any]] = None + rotation_configs: Optional[list[dict[str, Any]]] = None class BaseCompressor(object): @@ -133,16 +132,17 @@ def __init__( enable_torch_compile: bool = False, seed: int = 42, low_cpu_mem_usage: bool = True, - hadamard_config: str | dict | HadamardConfig | None = None, **kwargs, ): self.quantize_config = None - self.config_list = config if isinstance(config, list) else [config] - for config in self.config_list: - if isinstance(config, QuantizationConfig): - self.quantize_config = config + self.rotation_configs: list[BaseRotationConfig] = [] + _config_list = config if isinstance(config, list) else [config] + for _cfg in _config_list: + if isinstance(_cfg, QuantizationConfig): + self.quantize_config = _cfg + elif isinstance(_cfg, BaseRotationConfig): + self.rotation_configs.append(_cfg) assert self.quantize_config is not None, "QuantizationConfig is required for Compressor" - self.config_list.remove(self.quantize_config) # Scheme is passed directly to the compressor, not stored in QuantizationConfig. self.scheme = scheme @@ -216,11 +216,6 @@ def __init__( logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - # Accept legacy name for backward compatibility - self._hadamard_config = ( - hadamard_config or kwargs.pop("hadamard_config", None) or kwargs.pop("transform_config", None) - ) - # Reset both context singletons before creating fresh instances so that # consecutive AutoRound creations don't inherit stale config from earlier ones. 
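# A self-contained sketch of the config-list convention introduced by this
# patch (the two classes below are stand-ins, not the real auto-round ones):
# the caller may pass a single config or a list, exactly one QuantizationConfig
# is required, and every BaseRotationConfig is collected in order so that
# post_init() can later apply each rotation one after another.
class QuantizationConfig: ...
class BaseRotationConfig: ...

def split_configs(config):
    quantize_config, rotation_configs = None, []
    for cfg in (config if isinstance(config, list) else [config]):
        if isinstance(cfg, QuantizationConfig):
            quantize_config = cfg
        elif isinstance(cfg, BaseRotationConfig):
            rotation_configs.append(cfg)
    assert quantize_config is not None, "QuantizationConfig is required for Compressor"
    return quantize_config, rotation_configs

qcfg, rots = split_configs([QuantizationConfig(), BaseRotationConfig(), BaseRotationConfig()])
assert qcfg is not None and len(rots) == 2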
CompressContext.reset_context() @@ -672,15 +667,16 @@ def post_init(self) -> None: for _lname, _lval in _gguf_layer_cfg.items(): self.layer_config.setdefault(_lname, _lval) - # ── Phase 2d: apply hadamard transform ─────────────────────────────── - if self._hadamard_config: + # ── Phase 2d: apply rotation transforms ────────────────────────────── + if self.rotation_configs: check_supported_schemes(self.scheme) - self.model_context.model = apply_rotation( - self.model_context.model, - self._hadamard_config, - need_calibration=True if self.quantize_config.iters > 0 else False, - ) - self._hadamard_config = normalize_hadamard_config(self._hadamard_config) + need_calibration = self.quantize_config.iters > 0 + for rotation_cfg in self.rotation_configs: + self.model_context.model = apply_rotation( + self.model_context.model, + rotation_cfg, + need_calibration=need_calibration, + ) # ── Phase 3: patch model structure ─────────────────────────────────── # update_module() may replace layers (e.g. MoE expert merging); must From f4da8be29d7ebd50d89c6fa3a0577b0828e0d394 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 30 Mar 2026 16:38:22 +0800 Subject: [PATCH 22/90] sync compressors_new: add is_dynamic_afp8, is_block_wfp8, _get_safetensor helper, safetensor_only_matched, dispatch None guard, extend ignore_layers Signed-off-by: n1ck-guo --- auto_round/compressors_new/utils.py | 84 ++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index c48144411..f319d00a5 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import json import os import random import re @@ -100,6 +101,19 @@ def is_dynamic_wint8aint8(ar_or_format: Union[str, Callable]) -> bool: return False +def is_dynamic_afp8(ar_or_format: Callable) -> bool: + return ar_or_format.act_dynamic and ar_or_format.act_data_type.startswith("fp") and ar_or_format.act_bits == 8 + + +def is_block_wfp8(ar_or_format: Callable) -> bool: + return ( + isinstance(ar_or_format.group_size, tuple) + and len(ar_or_format.group_size) == 2 + and ar_or_format.data_type.startswith("fp") + and ar_or_format.bits == 8 + ) + + def block_forward( block: torch.nn.Module, input_ids: torch.Tensor, @@ -211,6 +225,63 @@ def infer_bits_by_data_type(data_type: str): return None +def _get_safetensor_layer_names_not_in_model(model, all_module_names: list) -> list: + """Collect layer names from safetensor files that are not loaded into the model. + + Some tensors (e.g. MTP layers) exist in the original checkpoint but are not + instantiated by ``transformers``. This function discovers them so that regex + patterns in ``layer_config`` can still match them. + + Returns: + List of layer names (the path without the ``.weight`` suffix) for weight + tensors present in the safetensor files but absent from *all_module_names*. 
+ """ + name_or_path = None + if hasattr(model, "config") and hasattr(model.config, "name_or_path"): + name_or_path = model.config.name_or_path + if not name_or_path: + return [] + + if not os.path.isdir(name_or_path): + try: + from auto_round.utils.model import download_hf_model + + name_or_path = download_hf_model(name_or_path) + except Exception as e: + logger.debug(f"Could not resolve source model path to check for missing tensors: {e}") + return [] + + try: + from safetensors import safe_open + except ImportError: + return [] + + # Build tensor-name list from the safetensors index or single file + source_index_file = os.path.join(name_or_path, "model.safetensors.index.json") + source_single_file = os.path.join(name_or_path, "model.safetensors") + + tensor_names: list = [] + if os.path.exists(source_index_file): + with open(source_index_file) as f: + src_index = json.load(f) + tensor_names = list(src_index["weight_map"].keys()) + elif os.path.exists(source_single_file): + with safe_open(source_single_file, framework="pt", device="cpu") as f: + tensor_names = list(f.keys()) + else: + return [] + + module_name_set = set(all_module_names) + extra_layer_names = [] + for tensor_name in tensor_names: + if not tensor_name.endswith(".weight"): + continue + layer_name = tensor_name[: -len(".weight")] + if layer_name not in module_name_set: + extra_layer_names.append(layer_name) + return extra_layer_names + + def set_layer_config( model: torch.nn.Module, layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], @@ -239,6 +310,10 @@ def dispatch_layer_config(layer_config: dict[str, dict]) -> None: """Assign scheme values as attributes to matched modules.""" for layer_name, scheme in layer_config.items(): module = get_module(model, layer_name) + if module is None: + # Layer exists in safetensor files but is not loaded into the model + # (e.g. MTP layers that transformers does not instantiate). Skip. + continue for attr, value in scheme.items(): setattr(module, attr, value) @@ -344,6 +419,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if isinstance(m, embedding_types) or m.__class__.__name__.endswith("Embedding"): embedding_layer_names.append(n) + # Also include layer names from safetensor files not loaded into the model + # (e.g. MTP layers that transformers does not instantiate). + safetensor_only_names = _get_safetensor_layer_names_not_in_model(model, all_module_names) + # 6. expand regex configs regex_config = {} for name in list(layer_config.keys()): @@ -361,7 +440,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str regex = re.compile(to_standard_regex(name)) matched = [ln for ln in all_supported_layer_names if regex.search(ln)] - if not matched: + safetensor_only_matched = [ln for ln in safetensor_only_names if regex.search(ln)] + # skip it for mtp layers not loaded in transformers + if not matched and not safetensor_only_matched: # type(mlp.gate) is Qwen3VLMoeTextTopKRouter instead of Linear logger.warning_once( f"Layer name or regex '{name}' in layer_config does not match any supported layers. 
" @@ -913,6 +994,7 @@ def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): for name in all_layer_names: if fp_layer in name: not_to_quantized_layers.append(name) + not_to_quantized_layers.extend(ignore_layers) # keep regex name for later use logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}") return not_to_quantized_layers From 75a472a2250117c1660d87d44dfcc8aad0cb9f72 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 31 Mar 2026 09:28:08 +0800 Subject: [PATCH 23/90] merge main Signed-off-by: n1ck-guo --- .../quantization/auto_round/quantizer.py | 136 ------------------ auto_round/algorithms/quantization/base.py | 131 +++++++++++++++++ .../algorithms/quantization/rtn/config.py | 4 - .../algorithms/quantization/rtn/quantizer.py | 113 +-------------- 4 files changed, 132 insertions(+), 252 deletions(-) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/auto_round/quantizer.py index 066d77e34..5b369666e 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/auto_round/quantizer.py @@ -589,86 +589,6 @@ def quantize_layer( dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" logger.info(dump_info) - @torch.no_grad() - def _get_block_outputs( - self, - block: torch.nn.Module, - input_ids: torch.Tensor | list[torch.Tensor], - input_others: torch.Tensor | dict, - bs: int, - save_output: bool = True, - ): - """Compute the output of a given block of the model for a given input. - - Args: - block: The block of the model. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - bs: The batch size for computing the output. - device: The device for computation. - cache_device: The device for storing the output. - batch_dim: The batch dimension of the output tensor. - - Returns: - The output tensor of the block. - """ - if self.model_context.is_diffusion: - return self._get_diffusion_block_outputs( - block, - input_ids, - input_others, - bs, - self.compress_context.device, - self.compress_context.cache_device, - ) - - if ( - (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks - or self.enable_alg_ext # Use imatrix - # or not self.disable_opt_rtn # Use imatrix - ): - self.block_forward = block_forward - else: - # TODO FIXME - # This function could not be compiled, causing a large accuracy drop when `enable_alg_ext` is used. - # To avoid issues, remove it in all scenarios except WOQ. 
- self.block_forward = ( - compile_func(block_forward, self.compress_context.device) - if self.compress_context.enable_torch_compile - else block_forward - ) - - output = [] - nsamples = len(input_ids) - for i in range(0, nsamples, bs): - end_index = min(nsamples, i + bs) - indices = torch.arange(i, end_index).to(torch.long) - tmp_input_ids, tmp_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - self.seqlen, - self.batch_dim, - share_cache_keys=self.model_context.shared_cache_keys, - ) - tmp_output = self.block_forward( - block, - tmp_input_ids, - tmp_input_others, - self.model_context.amp, - self.model_context.amp_dtype, - self.compress_context.device, - ).to(self.compress_context.cache_device) - if save_output: - if self.batch_size == 1: - output.append(tmp_output) - else: - output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) - if self.compress_context.low_gpu_mem_usage: - clear_memory(device_list=self.compress_context.device_list) - - return output - @torch.no_grad() def _get_diffusion_block_outputs( self, @@ -741,62 +661,6 @@ def _get_diffusion_block_outputs( return output - @classmethod - @torch.no_grad() - def _sampling_inputs( - cls, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - indices: list[int] | torch.Tensor, - seqlen: int, - batch_dim: int = 0, - share_cache_keys: tuple = (), - ): - """Samples inputs based on the given indices and sequence length. - - Args: - input_ids: The list of input tensor containing input_ids. - input_others: A dictionary containing other input data. - indices: The indices to sample from the input. - seqlen: The sequence length. - - Returns: - current_input_ids: The sampled input IDs. - current_input_others: The sampled other input data. - """ - if isinstance(input_ids, list): - current_input_ids = [input_ids[i] for i in indices] - current_input_ids = torch.cat(current_input_ids, dim=batch_dim) - elif isinstance(input_ids, dict): - current_input_ids = defaultdict(list) - for k in input_ids.keys(): - current_input_ids[k].extend([input_ids[k][i] for i in indices]) - current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) - - current_input_others = {"positional_inputs": input_others["positional_inputs"]} - for key in input_others.keys(): - if "positional_inputs" in key: - continue - # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored - # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such - # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). - # Always pass them through unchanged. - if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): - current_input_others[key] = input_others[key] - elif input_others[key] is not None: - current_input_others[key] = [input_others[key][i] for i in indices] - if len(indices) == 1: - current_input_others[key] = current_input_others[key][0] - else: - try: - current_input_others[key] = torch.cat(current_input_others[key], dim=0) - except TypeError as err: - logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") - else: - current_input_others[key] = None - - return current_input_ids, current_input_others - def _get_optimizer(self, optimizer: Any): """Returns the specified optimizer. In SignRound, we fix the optimizer. 
diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 7a60f5191..a5d72f59b 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -13,11 +13,14 @@ # limitations under the License. import importlib import traceback +from collections import defaultdict +from typing import Union import torch from auto_round.algorithms.quantization.config import QuantizationConfig from auto_round.compressors_new.utils import ( + block_forward, check_need_act_calibration, ) from auto_round.data_type import QUANT_FUNC_WITH_DTYPE @@ -28,6 +31,7 @@ SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, + compile_func, ) @@ -40,6 +44,7 @@ class BaseQuantizers: dataset = None supported_types = SUPPORTED_LAYER_TYPES inner_supported_types = INNER_SUPPORTED_LAYER_TYPES + enable_alg_ext = False def __init__(self, config: QuantizationConfig): self.config = config @@ -59,6 +64,12 @@ def __init__(self, config: QuantizationConfig): self.ignore_layers = config.ignore_layers self.quant_lm_head = config.quant_lm_head self.to_quant_block_names = config.to_quant_block_names + # Calibration / sampling attrs – populated from config if present. + self.seqlen = getattr(config, "seqlen", 2048) + self.nsamples = getattr(config, "nsamples", 128) + self.batch_size = getattr(config, "batch_size", 8) + self.batch_dim = getattr(config, "batch_dim", None) + self.infer_bs_coeff = getattr(config, "infer_bs_coeff", 1) @classmethod def from_config(cls, config: QuantizationConfig): @@ -274,3 +285,123 @@ def quantize_layer(self, layer_name: str, **kwargs): retrieved internally via get_module(model, layer_name). """ raise NotImplementedError("quantize_layer must be implemented in subclasses of BaseQuantizers") + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids, + input_others, + bs: int, + save_output: bool = True, + ): + """Compute the output of a block for calibration inputs. + + Shared by ARQuantizer and OptimizedRTNQuantizer. Algorithm-specific + block-forward selection (compile vs. plain) is handled here based on + ``enable_alg_ext`` and act-quantization flags. + """ + if getattr(self.model_context, "is_diffusion", False) and hasattr(self, "_get_diffusion_block_outputs"): + return self._get_diffusion_block_outputs( + block, + input_ids, + input_others, + bs, + self.compress_context.device, + self.compress_context.cache_device, + ) + + if ( + (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks + or self.enable_alg_ext # Use imatrix + # or not self.disable_opt_rtn # Use imatrix + ): + self.block_forward = block_forward + else: + # TODO FIXME + # This function could not be compiled, causing a large accuracy drop when `enable_alg_ext` is used. + # To avoid issues, remove it in all scenarios except WOQ. 
+ self.block_forward = ( + compile_func(block_forward, self.compress_context.device) + if self.compress_context.enable_torch_compile + else block_forward + ) + + output = [] + nsamples = len(input_ids) + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + self.seqlen, + self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + tmp_output = self.block_forward( + block, + tmp_input_ids, + tmp_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + self.compress_context.device, + ).to(self.compress_context.cache_device) + if save_output: + if self.batch_size == 1: + output.append(tmp_output) + else: + output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) + if self.compress_context.low_gpu_mem_usage: + clear_memory(device_list=self.compress_context.device_list) + + return output + + @classmethod + @torch.no_grad() + def _sampling_inputs( + cls, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + indices, + seqlen: int, + batch_dim: int = 0, + share_cache_keys: tuple = (), + ): + """Sample a mini-batch of calibration inputs by indices. + + Shared by ARQuantizer and OptimizedRTNQuantizer. + """ + if isinstance(input_ids, list): + current_input_ids = [input_ids[i] for i in indices] + current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + elif isinstance(input_ids, dict): + current_input_ids = defaultdict(list) + for k in input_ids.keys(): + current_input_ids[k].extend([input_ids[k][i] for i in indices]) + current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) + + current_input_others = {"positional_inputs": input_others["positional_inputs"]} + for key in input_others.keys(): + if "positional_inputs" in key: + continue + # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored + # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such + # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). + # Always pass them through unchanged. + if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): + current_input_others[key] = input_others[key] + elif input_others[key] is not None: + current_input_others[key] = [input_others[key][i] for i in indices] + if len(indices) == 1: + current_input_others[key] = current_input_others[key][0] + else: + try: + current_input_others[key] = torch.cat(current_input_others[key], dim=0) + except TypeError as err: + logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") + else: + current_input_others[key] = None + + return current_input_ids, current_input_others diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py index e470a49b2..686d2bed1 100644 --- a/auto_round/algorithms/quantization/rtn/config.py +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -43,10 +43,6 @@ def __init__( self.infer_bs_coeff = 1 self.batch_dim = None - # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it. 
- # To avoid None issue, we keep a copy though it's a little ugly - if enable_opt_rtn and disable_opt_rtn: - raise ValueError("`enable_opt_rtn` and `disable_opt_rtn` are mutually exclusive; " "only one can be set.") if enable_opt_rtn: disable_opt_rtn = False self.orig_disable_opt_rtn = disable_opt_rtn diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 3466507ab..921ecd15e 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -273,115 +273,4 @@ def quantize_block(self, block: torch.nn.Module, **kwargs): if hasattr(m, "global_name") and check_to_quantized(m): self.quantize_layer(m.global_name) - @torch.no_grad() - def _get_block_outputs( - self, - block: torch.nn.Module, - input_ids: torch.Tensor | list[torch.Tensor], - input_others: torch.Tensor | dict, - bs: int, - save_output: bool = True, - ): - """Compute the output of a given block of the model for a given input. - - Args: - block: The block of the model. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - bs: The batch size for computing the output. - device: The device for computation. - cache_device: The device for storing the output. - batch_dim: The batch dimension of the output tensor. - - Returns: - The output tensor of the block. - """ - - self.block_forward = block_forward - - output = [] - nsamples = len(input_ids) - for i in range(0, nsamples, bs): - end_index = min(nsamples, i + bs) - indices = torch.arange(i, end_index).to(torch.long) - tmp_input_ids, tmp_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - self.seqlen, - self.batch_dim, - share_cache_keys=self.model_context.shared_cache_keys, - ) - tmp_output = self.block_forward( - block, - tmp_input_ids, - tmp_input_others, - self.model_context.amp, - self.model_context.amp_dtype, - self.compress_context.device, - ).to(self.compress_context.cache_device) - if save_output: - if self.batch_size == 1: - output.append(tmp_output) - else: - output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) - if self.compress_context.low_gpu_mem_usage: - clear_memory(device_list=self.compress_context.device_list) - - return output - - @classmethod - @torch.no_grad() - def _sampling_inputs( - cls, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - indices: list[int] | torch.Tensor, - seqlen: int, - batch_dim: int = 0, - share_cache_keys: tuple = (), - ): - """Samples inputs based on the given indices and sequence length. - - Args: - input_ids: The list of input tensor containing input_ids. - input_others: A dictionary containing other input data. - indices: The indices to sample from the input. - seqlen: The sequence length. - - Returns: - current_input_ids: The sampled input IDs. - current_input_others: The sampled other input data. 
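
The helper being removed here is the same `_sampling_inputs` now hosted on `BaseQuantizers`. A minimal sketch of its indexing behaviour, with invented keys and shapes (illustrative only, not part of the patch): per-sample entries are indexed and concatenated along the batch dimension, while shared cache keys pass through unchanged.

    import torch

    input_ids = [torch.randn(1, 8, 16) for _ in range(4)]                 # one tensor per sample
    input_others = {
        "positional_inputs": (),
        "attention_mask": [torch.ones(1, 8) for _ in range(4)],           # per-sample -> indexed
        "position_embeddings": (torch.randn(8, 16), torch.randn(8, 16)),  # shared -> passed as-is
    }
    indices = [0, 1]
    batch_ids = torch.cat([input_ids[i] for i in indices], dim=0)         # shape (2, 8, 16)
    batch_mask = torch.cat([input_others["attention_mask"][i] for i in indices], dim=0)
    shared = input_others["position_embeddings"]                          # untouched (cos, sin) tuple
    print(batch_ids.shape, batch_mask.shape, len(shared))
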
- """ - if isinstance(input_ids, list): - current_input_ids = [input_ids[i] for i in indices] - current_input_ids = torch.cat(current_input_ids, dim=batch_dim) - elif isinstance(input_ids, dict): - current_input_ids = defaultdict(list) - for k in input_ids.keys(): - current_input_ids[k].extend([input_ids[k][i] for i in indices]) - current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) - - current_input_others = {"positional_inputs": input_others["positional_inputs"]} - for key in input_others.keys(): - if "positional_inputs" in key: - continue - # Shared cache keys (e.g. position_embeddings, position_ids, cache_position) are stored - # directly as-is (not wrapped in a per-sample list) when batch_size > 1. Indexing such - # values by sample index would incorrectly decompose them (e.g. (cos, sin)[0] == cos). - # Always pass them through unchanged. - if key in share_cache_keys or isinstance(input_others[key], (str, bool, type(None))): - current_input_others[key] = input_others[key] - elif input_others[key] is not None: - current_input_others[key] = [input_others[key][i] for i in indices] - if len(indices) == 1: - current_input_others[key] = current_input_others[key][0] - else: - try: - current_input_others[key] = torch.cat(current_input_others[key], dim=0) - except TypeError as err: - logger.warning_once("Please check the model cache inputs or try setting batch_size to 1.") - else: - current_input_others[key] = None - - return current_input_ids, current_input_others + # _get_block_outputs and _sampling_inputs are defined in BaseQuantizers and inherited. From 01f687182129e7cff29a512b8a547eae81b784e5 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 31 Mar 2026 13:25:09 +0800 Subject: [PATCH 24/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 5 +++-- auto_round/compressors_new/calib.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index a5d72f59b..88f4debcd 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -301,8 +301,9 @@ def _get_block_outputs( block-forward selection (compile vs. plain) is handled here based on ``enable_alg_ext`` and act-quantization flags. 
""" - if getattr(self.model_context, "is_diffusion", False) and hasattr(self, "_get_diffusion_block_outputs"): - return self._get_diffusion_block_outputs( + diffusion_fn = getattr(self, "_get_diffusion_block_outputs", None) + if getattr(self.model_context, "is_diffusion", False) and diffusion_fn is not None: + return diffusion_fn( block, input_ids, input_others, diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 413521b81..ef0372dba 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -85,16 +85,16 @@ def __init__( self.dataset = dataset self.iters = iters super().__init__( - config, - model, - tokenizer, - platform, - format, - low_gpu_mem_usage, - device_map, - enable_torch_compile, - seed, - low_cpu_mem_usage, + config=config, + model=model, + tokenizer=tokenizer, + platform=platform, + format=format, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + low_cpu_mem_usage=low_cpu_mem_usage, **kwargs, ) if iters == 0: From 41e75bd24cf60a534e36458e9e3941908665854f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 31 Mar 2026 14:28:19 +0800 Subject: [PATCH 25/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 2 +- auto_round/compressors_new/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 88f4debcd..85492f210 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -302,7 +302,7 @@ def _get_block_outputs( ``enable_alg_ext`` and act-quantization flags. """ diffusion_fn = getattr(self, "_get_diffusion_block_outputs", None) - if getattr(self.model_context, "is_diffusion", False) and diffusion_fn is not None: + if getattr(self.model_context, "is_diffusion", False) and callable(diffusion_fn): return diffusion_fn( block, input_ids, diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index b7b510bae..bf86ed2c9 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -658,7 +658,7 @@ def post_init(self) -> None: # been set before Phase 1 (during get_formats → gguf_args_check). 
_gguf_layer_cfg = { k: v - for k, v in self.__dict__.get("layer_config", {}).items() + for k, v in (self.__dict__.get("layer_config") or {}).items() if k not in (self.quantize_config.layer_config or {}) } if _gguf_layer_cfg: From 92139d6c44da9f09f3a7ad66309f7a70f16fb610 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 31 Mar 2026 16:13:47 +0800 Subject: [PATCH 26/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 2 +- auto_round/compressors_new/base.py | 6 ------ auto_round/export/export_to_autogptq/export.py | 2 +- auto_round/export/export_to_autoround/export.py | 5 ++--- auto_round/export/export_to_autoround/export_to_fp8.py | 2 +- .../export/export_to_autoround/export_to_nvfp_mxfp.py | 4 ++-- 6 files changed, 7 insertions(+), 14 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 85492f210..751fcb653 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -310,7 +310,7 @@ def _get_block_outputs( bs, self.compress_context.device, self.compress_context.cache_device, - ) + ) # pylint : disable=E1102 if ( (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index bf86ed2c9..72412d3b3 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -650,12 +650,6 @@ def post_init(self) -> None: _new_scheme = _QS.from_dict({k: v for k, v in _new_scheme_dict.items() if v is not None}) self.scheme = _new_scheme - # ── Phase 2c: merge layer_config set by GGUFFormat._mixed handling ─── - # Inner GGUFFormat("q2_k_mixed", ar) calls _handle_special_schemes and - # stores the result directly in ar.__dict__["layer_config"]. - # self.layer_config is already the authoritative owner of this attr, so - # just merge any GGUFFormat-supplied per-layer entries that may have - # been set before Phase 1 (during get_formats → gguf_args_check). _gguf_layer_cfg = { k: v for k, v in (self.__dict__.get("layer_config") or {}).items() diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index e2f48e8ec..b3ca711d3 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -252,7 +252,7 @@ def save_quantized_as_autogptq( continue # Handle block layers if in_blocks or (block_name_to_quantize and check_start_with_block_name(layer_name, block_name_to_quantize)): - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if neq_keys: if matches_any_regex(layer_name, regex_config): continue diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 372638e99..fb489159a 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
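
These `check_neq_config` call sites switch from `quantization_config[k]` to `.get(k)` so a scheme key missing from the exported config yields `None` instead of raising. A toy illustration with placeholder keys (not the real scheme key set):

    scheme_keys = ("bits", "group_size", "act_bits")
    quantization_config = {"bits": 4, "group_size": 128}      # "act_bits" not exported

    # {k: quantization_config[k] for k in scheme_keys} would raise KeyError: 'act_bits'
    kwargs = {k: quantization_config.get(k) for k in scheme_keys}
    print(kwargs)    # {'bits': 4, 'group_size': 128, 'act_bits': None}
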
- import copy import functools import inspect @@ -287,7 +286,7 @@ def save_quantized_as_autoround( elif cfg["in_blocks"] or ( block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) ): - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if len(neq_keys) > 0: extra_config[layer_name] = {} for key in neq_keys: @@ -298,7 +297,7 @@ def save_quantized_as_autoround( if regex_config is not None: for name, cfg in regex_config.items(): regex_name = to_standard_regex(name) - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if len(neq_keys) > 0: extra_config[regex_name] = {} for key in neq_keys: diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index fadebba5d..76f8fdbfe 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -230,7 +230,7 @@ def save_quantized_as_autoround( elif cfg["in_blocks"] or ( block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) ): - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if len(neq_keys) > 0: extra_config[layer_name] = {} for key in neq_keys: diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index 502c49676..836238114 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -214,7 +214,7 @@ def save_quantized_as_fp( elif cfg["in_blocks"] or ( block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) ): - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if len(neq_keys) > 0: extra_config[layer_name] = {} for key in neq_keys: @@ -225,7 +225,7 @@ def save_quantized_as_fp( if regex_config is not None: for name, cfg in regex_config.items(): regex_name = to_standard_regex(name) - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + neq_keys = check_neq_config(cfg, **{k: quantization_config.get(k) for k in scheme_keys}) if len(neq_keys) > 0: extra_config[regex_name] = {} for key in neq_keys: From 166b5b6535497c3bd7f4e147cd4e9a64f0cc6c80 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 31 Mar 2026 16:48:36 +0800 Subject: [PATCH 27/90] fix output dir Signed-off-by: n1ck-guo --- auto_round/__main__.py | 38 +++------------------ auto_round/compressors_new/base.py | 54 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 9da465810..9b5037eeb 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -38,6 +38,7 @@ class BasicArgumentParser(argparse.ArgumentParser): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.add_argument( @@ -724,41 +725,10 @@ def tune(args): trust_remote_code=not args.disable_trust_remote_code, ) - model_name = 
args.model.rstrip("/") - - if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}") - elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format: - export_dir = args.output_dir - elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format: - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf") - else: - if isinstance(autoround.group_size, tuple): - assert len(autoround.group_size) == 2, f"Only support 2D group_size, but get {autoround.group_size}" - suffix = f"g{autoround.group_size[0]}x{autoround.group_size[1]}" - else: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - prefix = autoround.data_type.lower().replace("_", "") if "int" not in autoround.data_type else "" - export_dir = os.path.join( - args.output_dir, - model_name.split("/")[-1] + (f"-{prefix}" if prefix else "") + f"-w{autoround.bits}{suffix}", - ) - # ======================= Quantize and save model ======================= - model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 + # Export directory is now derived automatically inside quantize_and_save via + # BaseCompressor._get_export_dir(), so we only need to pass the base output_dir. + model, folders = autoround.quantize_and_save(args.output_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 model.eval() diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 72412d3b3..5f072982c 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -898,6 +898,55 @@ def save_quantized( else: return compressed_model + def _get_export_dir(self, output_dir: str, format_str: str) -> str: + """Derive a descriptive export directory from model name and quantization config. + + Must be called after ``post_init()`` so that scheme-resolved attrs + (bits, group_size, data_type, etc.) are available on ``self.quantize_config``. + + Mirrors the logic previously in ``__main__.py`` so callers only need to + pass the base ``output_dir`` and the format string. + """ + model_name = (getattr(self.model_context.model, "name_or_path", "") or "").rstrip("/") + cfg = self.quantize_config + group_size = cfg.group_size + bits = cfg.bits + data_type = cfg.data_type or "int" + act_bits = cfg.act_bits or 16 + act_data_type = cfg.act_data_type or "float" + + is_gguf = "gguf" in (format_str or "") + last = model_name.split("/")[-1].strip(".") + + if last == "" and not is_gguf: + # model path is just '.' 
or './' – put inside output_dir with suffix + if group_size <= 0: + suffix = f"afp{act_bits}" if "fp" in act_data_type else f"a{act_bits}" + else: + suffix = f"g{group_size}" + return os.path.join(output_dir, f"w{bits}{suffix}") + + if last == "" and is_gguf: + return output_dir + + if is_gguf: + return os.path.join(output_dir, model_name.split("/")[-1] + "-gguf") + + # Normal case: derive suffix from group_size / act config + if isinstance(group_size, tuple): + assert len(group_size) == 2, f"Only support 2D group_size, but got {group_size}" + suffix = f"g{group_size[0]}x{group_size[1]}" + elif group_size <= 0: + suffix = f"afp{act_bits}" if "fp" in act_data_type else f"a{act_bits}" + else: + suffix = f"g{group_size}" + + prefix = data_type.lower().replace("_", "") if "int" not in data_type else "" + return os.path.join( + output_dir, + model_name.split("/")[-1] + (f"-{prefix}" if prefix else "") + f"-w{bits}{suffix}", + ) + def quantize_and_save( self, output_dir: str = "tmp_autoround", format: str = None, inplace: bool = True, **kwargs ) -> tuple[torch.nn.Module, dict[str, Any]]: @@ -948,6 +997,11 @@ def quantize_and_save( # IMPORTANT: post_init() must run outside any @torch.inference_mode() context # because AutoScheme's delta-loss selection requires gradient tracking. self.post_init() + # Derive descriptive export dir after post_init so scheme-resolved attrs are available. + _fmt_str = format or (self.formats if isinstance(self.formats, str) else "") + output_dir = self._get_export_dir(output_dir, _fmt_str) + self.output_dir = output_dir + self.compress_context.output_dir = output_dir if self.static_attention_dtype is not None: from auto_round.experimental.attention import attention_quant_ctx From 31b2d2b96069ae33ef841a74cde818e7cafe9a37 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 1 Apr 2026 15:14:38 +0800 Subject: [PATCH 28/90] update by comment Signed-off-by: n1ck-guo --- .../algorithms/quantization/__init__.py | 6 +- auto_round/algorithms/quantization/base.py | 89 +++++++++-- auto_round/algorithms/quantization/config.py | 1 - .../algorithms/quantization/rtn/config.py | 7 +- .../algorithms/quantization/rtn/quantizer.py | 60 +------- .../{auto_round => sign_round}/__init__.py | 0 .../{auto_round => sign_round}/adam.py | 10 +- .../{auto_round => sign_round}/config.py | 31 +--- .../{auto_round => sign_round}/quantizer.py | 113 +++----------- .../quantization/sign_round}/sign_sgd.py | 0 .../{rotation => transforms}/__init__.py | 10 +- .../{rotation => transforms}/base.py | 4 +- .../hadamard/__init__.py | 8 +- .../hadamard/apply.py | 24 +-- .../hadamard/config.py | 6 +- .../hadamard/patch.py | 0 .../hadamard/transforms.py | 4 +- .../hadamard/utils/__init__.py | 0 .../hadamard/utils/hadamards.safetensors | Bin .../hadamard/utils/math.py | 0 .../hadamard/utils/matrix.py | 0 .../hadamard/utils/triton/__init__.py | 0 .../hadamard/utils/triton/mxfp4.py | 0 auto_round/autoround.py | 4 +- auto_round/compressors/base.py | 2 +- auto_round/compressors_new/__init__.py | 16 +- .../architecture_visualization.py | 46 +++--- auto_round/compressors_new/base.py | 33 ++-- auto_round/compressors_new/calib.py | 52 +++---- .../docs/compressors_new_architecture.md | 14 +- .../docs/compressors_new_architecture_CN.md | 18 +-- auto_round/compressors_new/entry.py | 145 ++++++++++++------ auto_round/compressors_new/mllm_mixin.py | 2 +- auto_round/compressors_new/zero_shot.py | 31 +++- auto_round/context/compress.py | 6 + auto_round/inference/backend.py | 4 +- auto_round/wrapper.py | 14 +- 37 files changed, 
387 insertions(+), 373 deletions(-) rename auto_round/algorithms/quantization/{auto_round => sign_round}/__init__.py (100%) rename auto_round/algorithms/quantization/{auto_round => sign_round}/adam.py (87%) rename auto_round/algorithms/quantization/{auto_round => sign_round}/config.py (74%) rename auto_round/algorithms/quantization/{auto_round => sign_round}/quantizer.py (87%) rename auto_round/{ => algorithms/quantization/sign_round}/sign_sgd.py (100%) rename auto_round/algorithms/{rotation => transforms}/__init__.py (93%) rename auto_round/algorithms/{rotation => transforms}/base.py (97%) rename auto_round/algorithms/{rotation => transforms}/hadamard/__init__.py (74%) rename auto_round/algorithms/{rotation => transforms}/hadamard/apply.py (90%) rename auto_round/algorithms/{rotation => transforms}/hadamard/config.py (94%) rename auto_round/algorithms/{rotation => transforms}/hadamard/patch.py (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/transforms.py (97%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/__init__.py (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/hadamards.safetensors (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/math.py (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/matrix.py (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/triton/__init__.py (100%) rename auto_round/algorithms/{rotation => transforms}/hadamard/utils/triton/mxfp4.py (100%) diff --git a/auto_round/algorithms/quantization/__init__.py b/auto_round/algorithms/quantization/__init__.py index 00de9d3ac..719528d14 100644 --- a/auto_round/algorithms/quantization/__init__.py +++ b/auto_round/algorithms/quantization/__init__.py @@ -14,8 +14,8 @@ from auto_round.algorithms.quantization.base import BaseQuantizers from auto_round.algorithms.quantization.config import QuantizationConfig -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig -from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer -from auto_round.algorithms.quantization.auto_round.adam import ARAdamQuantizer +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig +from auto_round.algorithms.quantization.sign_round.quantizer import SignRoundQuantizer +from auto_round.algorithms.quantization.sign_round.adam import SignRoundAdamQuantizer from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.algorithms.quantization.rtn.quantizer import RTNQuantizer, OptimizedRTNQuantizer diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 751fcb653..99a156ac4 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -45,10 +45,14 @@ class BaseQuantizers: supported_types = SUPPORTED_LAYER_TYPES inner_supported_types = INNER_SUPPORTED_LAYER_TYPES enable_alg_ext = False + # Subclasses that support diffusion models should override this with the + # appropriate output key mapping, e.g.: + # DIFFUSION_OUTPUT_CONFIGS = {"FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"]} + DIFFUSION_OUTPUT_CONFIGS: dict = {} def __init__(self, config: QuantizationConfig): self.config = config - self.layer_config = config.layer_config + self.layer_config = None self.bits = config.bits self.group_size = config.group_size self.sym = config.sym @@ -64,9 +68,9 @@ def __init__(self, config: QuantizationConfig): 
self.ignore_layers = config.ignore_layers self.quant_lm_head = config.quant_lm_head self.to_quant_block_names = config.to_quant_block_names - # Calibration / sampling attrs – populated from config if present. - self.seqlen = getattr(config, "seqlen", 2048) - self.nsamples = getattr(config, "nsamples", 128) + # Calibration / sampling attrs – synced from compressor in post_init. + self.seqlen = 2048 + self.nsamples = 128 self.batch_size = getattr(config, "batch_size", 8) self.batch_dim = getattr(config, "batch_dim", None) self.infer_bs_coeff = getattr(config, "infer_bs_coeff", 1) @@ -270,7 +274,7 @@ def quantize_block( reference_output: FP reference outputs collected by Compressor (None for algorithms that don't need a reconstruction loss). **kwargs: Algorithm-specific keyword arguments (e.g. ``loss_device``, - ``card_0_in_high_risk`` for ARQuantizer). + ``card_0_in_high_risk`` for SignRoundQuantizer). Returns: dict: Best quantization parameters found, or ``{}`` if not applicable. @@ -297,20 +301,20 @@ def _get_block_outputs( ): """Compute the output of a block for calibration inputs. - Shared by ARQuantizer and OptimizedRTNQuantizer. Algorithm-specific + Shared by SignRoundQuantizer and OptimizedRTNQuantizer. Algorithm-specific block-forward selection (compile vs. plain) is handled here based on ``enable_alg_ext`` and act-quantization flags. """ diffusion_fn = getattr(self, "_get_diffusion_block_outputs", None) - if getattr(self.model_context, "is_diffusion", False) and callable(diffusion_fn): - return diffusion_fn( + if getattr(self.model_context, "is_diffusion", False): + return self._get_diffusion_block_outputs( block, input_ids, input_others, bs, self.compress_context.device, self.compress_context.cache_device, - ) # pylint : disable=E1102 + ) if ( (self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks @@ -354,8 +358,7 @@ def _get_block_outputs( output.append(tmp_output) else: output.extend(list(torch.split(tmp_output, 1, dim=self.batch_dim))) - if self.compress_context.low_gpu_mem_usage: - clear_memory(device_list=self.compress_context.device_list) + self.compress_context.clear_memory() return output @@ -372,7 +375,7 @@ def _sampling_inputs( ): """Sample a mini-batch of calibration inputs by indices. - Shared by ARQuantizer and OptimizedRTNQuantizer. + Shared by SignRoundQuantizer and OptimizedRTNQuantizer. """ if isinstance(input_ids, list): current_input_ids = [input_ids[i] for i in indices] @@ -406,3 +409,65 @@ def _sampling_inputs( current_input_others[key] = None return current_input_ids, current_input_others + + @torch.no_grad() + def _get_diffusion_block_outputs( + self, + block: torch.nn.Module, + input_ids: Union[torch.Tensor, dict], + input_others, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute block outputs for diffusion models. + + Uses ``self.DIFFUSION_OUTPUT_CONFIGS`` to map block class names to their + output keys. Subclasses override ``DIFFUSION_OUTPUT_CONFIGS`` to add + support for new diffusion architectures. 
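
The mapping described here is applied further down as `dict(zip(output_config, tmp_output))`. A tiny illustration with dummy tensors (illustrative only; only the Flux key mapping comes from this patch):

    import torch

    DIFFUSION_OUTPUT_CONFIGS = {"FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"]}
    output_config = DIFFUSION_OUTPUT_CONFIGS["FluxTransformerBlock"]
    tmp_output = (torch.zeros(1, 4), torch.zeros(1, 4))   # block returned a 2-tuple
    assert len(output_config) == len(tmp_output)
    named = dict(zip(output_config, tmp_output))
    print(list(named))   # ['encoder_hidden_states', 'hidden_states']
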
+ """ + output = defaultdict(list) + output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + if isinstance(input_ids, dict): + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + self.seqlen, + self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + if isinstance(tmp_input_ids, dict): + hidden_states = tmp_input_ids.pop("hidden_states") + tmp_input_others.update(tmp_input_ids) + tmp_input_ids = hidden_states + + tmp_output = block_forward( + block, + tmp_input_ids, + tmp_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + device, + None, + ) + assert len(output_config) == len(tmp_output) + tmp_output = dict(zip(output_config, tmp_output)) + + if save_output: + for name, out in tmp_output.items(): + if self.batch_size == 1: + output[name].append(out.to(cache_device)) + else: + output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) + self.compress_context.clear_memory() + + return output diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index 45eb47bcf..4ce703396 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -34,7 +34,6 @@ class QuantizationConfig(AlgConfig): _alg_cls: ClassVar[str] = None # quantization args - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None bits: int = None group_size: int = None sym: bool = None diff --git a/auto_round/algorithms/quantization/rtn/config.py b/auto_round/algorithms/quantization/rtn/config.py index 686d2bed1..6afc41b0f 100644 --- a/auto_round/algorithms/quantization/rtn/config.py +++ b/auto_round/algorithms/quantization/rtn/config.py @@ -22,21 +22,16 @@ class RTNConfig(QuantizationConfig): def __init__( self, - layer_config=None, *, disable_opt_rtn: bool = None, # for opt-rtn - seqlen: int = 2048, - nsamples: int = 128, batch_size: int = 8, **kwargs, ): # pop before super().__init__ so it doesn't leak into QuantizationConfig as an unknown kwarg enable_opt_rtn = kwargs.pop("enable_opt_rtn", None) - super().__init__(layer_config=layer_config, **kwargs) + super().__init__(**kwargs) - self.seqlen = seqlen - self.nsamples = nsamples self.batch_size = batch_size # Some helpers diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 921ecd15e..607cc9c4d 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -19,9 +19,9 @@ import accelerate import torch -from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer from auto_round.algorithms.quantization.base import BaseQuantizers from auto_round.algorithms.quantization.rtn.config import RTNConfig +from auto_round.algorithms.quantization.sign_round.quantizer import SignRoundQuantizer from auto_round.compressors_new.shard_writer import ShardWriter from auto_round.compressors_new.utils import ( IndexSampler, @@ -38,10 +38,8 @@ ) from auto_round.data_type.utils import update_block_global_scale_if_needed from auto_round.logger import logger -from auto_round.modeling.fused_moe.replace_modules import materialize_model_, safe_to_cpu_ from auto_round.utils import ( 
check_to_quantized, - clear_memory, convert_module_to_hp_if_necessary, get_lm_head_name, get_module, @@ -49,7 +47,6 @@ is_auto_device_mapping, is_hpex_available, memory_monitor, - mv_module_from_gpu, set_amax_for_all_moe_layers, set_module, ) @@ -73,11 +70,11 @@ def quantize_block( ) -> dict: """Apply zero-shot RTN quantization to a block. - Pure-algorithm entry point. Materialize / device placement is handled - by the Compressor before calling this method. + Pure-algorithm entry point. Infrastructure (materialize, shard writing, + device cleanup) is handled by the Compressor before/after this call. Args: - block: Module already materialized. + block: Module already materialized and placed on the correct device. input_ids: Unused for zero-shot RTN (accepted for interface consistency). input_others: Unused for zero-shot RTN. reference_output: Unused for zero-shot RTN. @@ -85,50 +82,9 @@ def quantize_block( Returns: dict: Empty dict (zero-shot RTN has no tunable parameters to return). """ - shard_writer = ShardWriter.get_shard_writer() - - tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) - if tied_weights_keys is None: - tied_weights_keys = [] - if isinstance(tied_weights_keys, dict): - tied_weights_values = list(tied_weights_keys.values()) - else: - tied_weights_values = list(tied_weights_keys) - tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias - # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it - if getattr(self.compress_context, "formats", None) and self.compress_context.formats[0].is_gguf(): - lm_head_name = get_lm_head_name(self.model) - if lm_head_name is not None: - tied_weights_layers.append(lm_head_name) - - materialize_model_(block) - for name, m in block.named_modules(): + for _name, m in block.named_modules(): if hasattr(m, "global_name") and check_to_quantized(m): self.quantize_layer(m.global_name) - elif ( - not any(m.children()) - and len(m.state_dict()) > 0 - and m.global_name not in tied_weights_layers - and self.compress_context.is_immediate_saving - ): - set_module(self.model, m.global_name, copy.deepcopy(m)) - if self.compress_context.is_immediate_saving: - shard_writer.write(name=m.global_name) - copied_m = get_module(self.model, m.global_name) - copied_m.to("meta") - m.to("meta") - - # Move remaining GPU tensors to CPU; offload to disk if low_cpu_mem_usage. - if not self.compress_context.is_immediate_saving: - mv_module_from_gpu(block) - else: - # Save once at block scope to capture tensors that are not saved - # in per-layer branch (e.g., custom module-level params/buffers). 
- block_name = getattr(block, "name", None) or getattr(block, "global_name", None) - if block_name: - shard_writer.write(name=block_name) - block.to("meta") - return {} def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: @@ -238,8 +194,6 @@ class OptimizedRTNQuantizer(RTNQuantizer): def __init__(self, config: RTNConfig): BaseQuantizers.__init__(self, config) self.batch_size = config.batch_size - self.seqlen = config.seqlen - self.nsamples = config.nsamples self.batch_dim = config.batch_dim self.data_type = config.data_type self.group_size = config.group_size @@ -263,10 +217,6 @@ def quantize_block(self, block: torch.nn.Module, **kwargs): # enable moe experts act_max automatic generation for Linear set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers - if self.compress_context.low_gpu_mem_usage: - block.to("cpu") - clear_memory(device_list=self.compress_context.device_list) - for name, m in block.named_modules(): if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt diff --git a/auto_round/algorithms/quantization/auto_round/__init__.py b/auto_round/algorithms/quantization/sign_round/__init__.py similarity index 100% rename from auto_round/algorithms/quantization/auto_round/__init__.py rename to auto_round/algorithms/quantization/sign_round/__init__.py diff --git a/auto_round/algorithms/quantization/auto_round/adam.py b/auto_round/algorithms/quantization/sign_round/adam.py similarity index 87% rename from auto_round/algorithms/quantization/auto_round/adam.py rename to auto_round/algorithms/quantization/sign_round/adam.py index b7a6131da..e81c02960 100644 --- a/auto_round/algorithms/quantization/auto_round/adam.py +++ b/auto_round/algorithms/quantization/sign_round/adam.py @@ -15,14 +15,16 @@ import torch -from auto_round.algorithms.quantization.auto_round.quantizer import ARQuantizer -from auto_round.compressors.base import BaseCompressor +from auto_round.algorithms.quantization.sign_round.quantizer import SignRoundQuantizer from auto_round.schemes import QuantizationScheme from auto_round.utils import check_is_cpu, htcore, is_hpex_available -class ARAdamQuantizer(ARQuantizer): - is_adam: bool = True +class SignRoundAdamQuantizer(SignRoundQuantizer): + + def _get_extra_optimizer_kwargs(self) -> dict: + """AdamW handles momentum internally; no extra kwargs needed.""" + return {} def _get_optimizer(self, optimizer): if optimizer is None: diff --git a/auto_round/algorithms/quantization/auto_round/config.py b/auto_round/algorithms/quantization/sign_round/config.py similarity index 74% rename from auto_round/algorithms/quantization/auto_round/config.py rename to auto_round/algorithms/quantization/sign_round/config.py index 66920a45c..4513f5ad3 100644 --- a/auto_round/algorithms/quantization/auto_round/config.py +++ b/auto_round/algorithms/quantization/sign_round/config.py @@ -17,7 +17,7 @@ from auto_round.logger import logger -class AutoRoundConfig(QuantizationConfig): +class SignRoundConfig(QuantizationConfig): """ Args: @@ -30,18 +30,15 @@ class AutoRoundConfig(QuantizationConfig): enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning """ - _alg_cls = "ARQuantizer" + _alg_cls = "SignRoundQuantizer" def __init__( self, - layer_config: dict[str, Union[str, dict]] = None, *, iters: int = 200, lr: float = None, minmax_lr: float = None, lr_scheduler=None, - seqlen: int = 2048, - nsamples: int = 128, momentum: float = 0.0, batch_size: int = 8, nblocks: int = 1, @@ -56,7 +53,7 @@ def __init__( enable_adam: bool = 
False, **kwargs, ): - super().__init__(layer_config=layer_config, **kwargs) + super().__init__(**kwargs) self.iters = iters if self.iters < 0: logger.warning("`iters` must be non-negative, reset it to 200") @@ -74,8 +71,6 @@ def __init__( self.minmax_lr = minmax_lr or self.lr self.lr_scheduler = lr_scheduler - self.seqlen = seqlen - self.nsamples = nsamples self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps self.nblocks = nblocks self.momentum = momentum @@ -96,7 +91,7 @@ def __init__( self.enable_adam = enable_adam if self.enable_adam: - self._alg_cls = "ARAdamQuantizer" + self._alg_cls = "SignRoundAdamQuantizer" def check_configs(self) -> None: """Checks if the configurations are valid. @@ -110,25 +105,7 @@ def check_configs(self) -> None: raise ValueError("`batch_size` must be positive") if self.iters < 0: raise ValueError("`iters` must be non-negative") - if self.seqlen <= 0: - raise ValueError("`seqlen` must be positive") if self.nblocks <= 0: raise ValueError("`nblocks` must be positive") if self.gradient_accumulate_steps <= 0: raise ValueError("`gradient_accumulate_steps` must be positive") - - if self.nsamples < self.gradient_accumulate_steps * self.batch_size: - if self.batch_size > self.nsamples: - if self.iters > 0: # GGUF should log this warning, but we don't know the format here - logger.warning( - f"reset `batch_size` to {self.nsamples} as `nsamples`({self.nsamples})" - f" is smaller than batch_size({self.batch_size})" - ) - self.batch_size = self.nsamples - if self.gradient_accumulate_steps > self.nsamples // self.batch_size: - self.gradient_accumulate_steps = self.nsamples // self.batch_size - logger.warning( - f"reset `gradient_accumulate_steps` to {self.gradient_accumulate_steps}" - f" as nsamples must equal or greater" - f" than gradient_accumulate_steps * batch_size" - ) diff --git a/auto_round/algorithms/quantization/auto_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py similarity index 87% rename from auto_round/algorithms/quantization/auto_round/quantizer.py rename to auto_round/algorithms/quantization/sign_round/quantizer.py index 5b369666e..339c8732c 100644 --- a/auto_round/algorithms/quantization/auto_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -21,8 +21,9 @@ import torch from torch import autocast -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig from auto_round.algorithms.quantization.base import BaseQuantizers +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig +from auto_round.algorithms.quantization.sign_round.sign_sgd import SignSGD from auto_round.compressors_new.utils import ( IndexSampler, block_forward, @@ -31,11 +32,8 @@ immediate_pack, ) from auto_round.logger import logger -from auto_round.modeling.fused_moe.replace_modules import materialize_model_ -from auto_round.sign_sgd import SignSGD from auto_round.utils import ( check_to_quantized, - clear_memory, compile_func, convert_module_to_hp_if_necessary, get_module, @@ -61,10 +59,14 @@ } -class ARQuantizer(BaseQuantizers): - is_adam: bool = False +class SignRoundQuantizer(BaseQuantizers): + # Override the base empty dict with Flux-specific output key mappings. 
+ DIFFUSION_OUTPUT_CONFIGS = { + "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], + } - def __init__(self, config: AutoRoundConfig): + def __init__(self, config: SignRoundConfig): super().__init__(config) self.attention_mask = [] @@ -72,8 +74,6 @@ def __init__(self, config: AutoRoundConfig): self.lr = config.lr self.minmax_lr = config.minmax_lr self.lr_scheduler = config.lr_scheduler - self.seqlen = config.seqlen - self.nsamples = config.nsamples self.batch_size = config.batch_size self.batch_dim = config.batch_dim self.momentum = config.momentum @@ -89,6 +89,14 @@ def __init__(self, config: AutoRoundConfig): self.optimizer = self._get_optimizer(optimizer=config.optimizer) self.wrapper_block = wrapper_block + def _get_extra_optimizer_kwargs(self) -> dict: + """Return extra keyword arguments passed to the optimizer constructor. + + SignSGD requires ``momentum``; AdamW-based subclasses override this to + return ``{}`` because AdamW handles its own momentum internally. + """ + return {"momentum": self.momentum} + def post_init(self): super().post_init() if self.enable_alg_ext: @@ -119,7 +127,7 @@ def _get_diffusion_current_q_output( device: str, cache_device: str = "cpu", ): - output_config = DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") current_input_ids, current_input_others = self._sampling_inputs( input_ids, @@ -259,14 +267,8 @@ def quantize_block( self.enable_norm_bias_tuning, enable_torch_compile=self.compress_context.enable_torch_compile, device=device, + is_nv_fp=self.config.is_nv_fp, ) - # Call this before quantization and after applying the block wrapper. - if self.config.is_nv_fp: # enable qkv and moe structure global_scale fuse. - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) round_params = [] minmax_params = [] for n, m in block.named_modules(): @@ -280,7 +282,7 @@ def quantize_block( lr = torch.tensor(self.lr) minmax_lr = torch.tensor(self.minmax_lr) - extra_kwargs = {} if self.is_adam else {"momentum": self.momentum} + extra_kwargs = self._get_extra_optimizer_kwargs() if self.enable_minmax_tuning: params = [ @@ -397,8 +399,7 @@ def quantize_block( "layers in the block" ) - if self.compress_context.low_gpu_mem_usage: - clear_memory(device_list=self.compress_context.device_list) # clear cached memory during training + self.compress_context.clear_memory() # clear cached memory during training if len(unquantized_layer_names) != 0: logger.info(f"Unquantized layers: {unquantized_layer_names}") with torch.no_grad(): @@ -589,78 +590,6 @@ def quantize_layer( dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" logger.info(dump_info) - @torch.no_grad() - def _get_diffusion_block_outputs( - self, - block: torch.nn.Module, - input_ids: Union[torch.Tensor, dict], - input_others: torch.Tensor, - bs: int, - device: Union[str, torch.device], - cache_device: Union[str, torch.device], - save_output: bool = True, - ): - """Compute the output of a given block of the model for a given input. - - Args: - block: The block of the model. - input_ids: The input tensor containing tokenized input ids. 
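
The `_get_extra_optimizer_kwargs` hook introduced in this patch replaces the old `is_adam` flag. A self-contained sketch of the pattern (class names below are stand-ins; only the hook name and the momentum/AdamW distinction come from the patch):

    import torch

    class _SignRoundLike:
        momentum = 0.0
        def _get_extra_optimizer_kwargs(self):
            return {"momentum": self.momentum}   # SignSGD expects an explicit momentum

    class _AdamLike(_SignRoundLike):
        def _get_extra_optimizer_kwargs(self):
            return {}                            # AdamW manages its own moments internally

    params = [torch.nn.Parameter(torch.zeros(2))]
    extra = _AdamLike()._get_extra_optimizer_kwargs()
    opt = torch.optim.AdamW(params, lr=5e-3, **extra)    # no unexpected 'momentum' kwarg
    print(type(opt).__name__, extra)
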
- input_others: A dictionary containing additional input data. - bs: The batch size for computing the output. - device: The device for computation. - cache_device: The device for storing the output. - batch_dim: The batch dimension of the output tensor. - - Returns: - The output tensor of the block. - """ - - output = defaultdict(list) - output_config = DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) - if isinstance(input_ids, dict): - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - - for i in range(0, nsamples, bs): - end_index = min(nsamples, i + bs) - indices = torch.arange(i, end_index).to(torch.long) - tmp_input_ids, tmp_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - self.seqlen, - self.batch_dim, - share_cache_keys=self.model_context.shared_cache_keys, - ) - if isinstance(tmp_input_ids, dict): - hidden_states = tmp_input_ids.pop("hidden_states") - tmp_input_others.update(tmp_input_ids) - tmp_input_ids = hidden_states - - tmp_output = block_forward( - block, - tmp_input_ids, - tmp_input_others, - self.model_context.amp, - self.model_context.amp_dtype, - device, - None, - ) - assert len(output_config) == len(tmp_output) - tmp_output = dict(zip(output_config, tmp_output)) - - if save_output: - for name, out in tmp_output.items(): - if self.batch_size == 1: - output[name].append(out.to(cache_device)) - else: - output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) - if self.compress_context.low_gpu_mem_usage: - clear_memory() - - return output - def _get_optimizer(self, optimizer: Any): """Returns the specified optimizer. In SignRound, we fix the optimizer. diff --git a/auto_round/sign_sgd.py b/auto_round/algorithms/quantization/sign_round/sign_sgd.py similarity index 100% rename from auto_round/sign_sgd.py rename to auto_round/algorithms/quantization/sign_round/sign_sgd.py diff --git a/auto_round/algorithms/rotation/__init__.py b/auto_round/algorithms/transforms/__init__.py similarity index 93% rename from auto_round/algorithms/rotation/__init__.py rename to auto_round/algorithms/transforms/__init__.py index 06e1bbdc9..ddc00687f 100644 --- a/auto_round/algorithms/rotation/__init__.py +++ b/auto_round/algorithms/transforms/__init__.py @@ -20,18 +20,18 @@ Current algorithms ------------------ * **hadamard** – Block-diagonal Hadamard rotations (QuaRot / SpinQuant style). - See :mod:`auto_round.algorithms.rotation.hadamard`. + See :mod:`auto_round.algorithms.transforms.hadamard`. Adding a new algorithm ----------------------- -1. Create ``algorithms/rotation//`` with ``config.py`` and ``apply.py``. +1. Create ``algorithms/transforms//`` with ``config.py`` and ``apply.py``. 2. Subclass :class:`BaseRotationConfig` and :class:`BaseRotation`; register with ``@BaseRotation.register("")``. 3. Re-export from this ``__init__.py``. 
Typical usage ------------- ->>> from auto_round.algorithms.rotation import apply_rotation +>>> from auto_round.algorithms.transforms import apply_rotation >>> model = apply_rotation(model, config={"hadamard_type": "random_hadamard"}) """ from __future__ import annotations @@ -40,13 +40,13 @@ import torch -from auto_round.algorithms.rotation.base import ( +from auto_round.algorithms.transforms.base import ( BaseRotation, BaseRotationConfig, ROTATION_SUPPORTED_SCHEMES, check_supported_schemes, ) -from auto_round.algorithms.rotation.hadamard import ( +from auto_round.algorithms.transforms.hadamard import ( HadamardConfig, HadamardRotation, apply_hadamard_transform, diff --git a/auto_round/algorithms/rotation/base.py b/auto_round/algorithms/transforms/base.py similarity index 97% rename from auto_round/algorithms/rotation/base.py rename to auto_round/algorithms/transforms/base.py index 658d932f4..05bce2472 100644 --- a/auto_round/algorithms/rotation/base.py +++ b/auto_round/algorithms/transforms/base.py @@ -55,7 +55,7 @@ class BaseRotation(ABC): Example ------- - >>> from auto_round.algorithms.rotation import apply_rotation + >>> from auto_round.algorithms.transforms import apply_rotation >>> model = apply_rotation(model, config={"algorithm": "hadamard", ...}) """ @@ -162,7 +162,7 @@ def _ensure_registry_populated() -> None: for sub in ("hadamard",): try: - importlib.import_module(f"auto_round.algorithms.rotation.{sub}") + importlib.import_module(f"auto_round.algorithms.transforms.{sub}") except ImportError: pass _registry_populated = True diff --git a/auto_round/algorithms/rotation/hadamard/__init__.py b/auto_round/algorithms/transforms/hadamard/__init__.py similarity index 74% rename from auto_round/algorithms/rotation/hadamard/__init__.py rename to auto_round/algorithms/transforms/hadamard/__init__.py index 02b61f979..d86923fa5 100644 --- a/auto_round/algorithms/rotation/hadamard/__init__.py +++ b/auto_round/algorithms/transforms/hadamard/__init__.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Hadamard rotation sub-package for ``algorithms/rotation``.""" +"""Hadamard rotation sub-package for ``algorithms/transforms``.""" -from auto_round.algorithms.rotation.hadamard.apply import HadamardRotation, apply_hadamard_transform -from auto_round.algorithms.rotation.hadamard.config import HadamardConfig, normalize_hadamard_config -from auto_round.algorithms.rotation.hadamard.transforms import ( +from auto_round.algorithms.transforms.hadamard.apply import HadamardRotation, apply_hadamard_transform +from auto_round.algorithms.transforms.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.transforms.hadamard.transforms import ( HADAMARDS, HadamardTransform, RandomHadamardTransform, diff --git a/auto_round/algorithms/rotation/hadamard/apply.py b/auto_round/algorithms/transforms/hadamard/apply.py similarity index 90% rename from auto_round/algorithms/rotation/hadamard/apply.py rename to auto_round/algorithms/transforms/hadamard/apply.py index 219e684fa..428827ff1 100644 --- a/auto_round/algorithms/rotation/hadamard/apply.py +++ b/auto_round/algorithms/transforms/hadamard/apply.py @@ -25,9 +25,9 @@ import torch import tqdm -from auto_round.algorithms.rotation.base import BaseRotation -from auto_round.algorithms.rotation.hadamard.config import HadamardConfig, normalize_hadamard_config -from auto_round.algorithms.rotation.hadamard.transforms import build_hadamard_transform +from auto_round.algorithms.transforms.base import BaseRotation +from auto_round.algorithms.transforms.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.transforms.hadamard.transforms import build_hadamard_transform from auto_round.experimental.qmodules.mx import MXQuantLinearBase # optional dep, guarded below __all__ = ["HadamardRotation", "apply_hadamard_transform"] @@ -40,7 +40,7 @@ def _triton_available() -> bool: if not torch.cuda.is_available(): return False - from auto_round.algorithms.rotation.hadamard.utils.triton.mxfp4 import ( # noqa: F401 + from auto_round.algorithms.transforms.hadamard.utils.triton.mxfp4 import ( # noqa: F401 mxfp4_forward_kernel_wrapper, ) @@ -54,16 +54,16 @@ class HadamardRotation(BaseRotation): """Hadamard rotation algorithm. Registered under ``"hadamard"`` in the - :class:`~auto_round.algorithms.rotation.base.BaseRotation` registry. + :class:`~auto_round.algorithms.transforms.base.BaseRotation` registry. 
Typical usage (via the top-level helper):: - from auto_round.algorithms.rotation import apply_rotation + from auto_round.algorithms.transforms import apply_rotation model = apply_rotation(model, config={"hadamard_type": "random_hadamard"}) Or directly:: - from auto_round.algorithms.rotation.hadamard import apply_hadamard_transform + from auto_round.algorithms.transforms.hadamard import apply_hadamard_transform model = apply_hadamard_transform(model, config=HadamardConfig(), need_calibration=True) """ @@ -139,7 +139,7 @@ def _apply_to_module( location: str, ) -> None: """Apply the configured Hadamard transform to a single *module*.""" - from auto_round.algorithms.rotation.hadamard.patch import ( + from auto_round.algorithms.transforms.hadamard.patch import ( patch_quantlinear, patch_wrapperlinear_to_apply_transform, patch_wrapperwalayer_forward_to_apply_transform, @@ -157,7 +157,7 @@ def _apply_to_module( def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig) -> None: """Register a forward pre-hook that applies the Hadamard to the input activation.""" - from auto_round.algorithms.rotation.hadamard.utils.matrix import multihead_matmul + from auto_round.algorithms.transforms.hadamard.utils.matrix import multihead_matmul inp_transform = build_hadamard_transform( **config.model_dump(), @@ -173,7 +173,7 @@ def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig) -> N hadamard_weight = None if _triton_available(): - from auto_round.algorithms.rotation.hadamard.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper + from auto_round.algorithms.transforms.hadamard.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper def _input_hook(self, args): x = args[0] @@ -207,7 +207,7 @@ def _apply_weight_transform( need_calibration: bool, ) -> None: """Fuse or patch the Hadamard rotation into the weight of *module*.""" - from auto_round.algorithms.rotation.hadamard.patch import ( + from auto_round.algorithms.transforms.hadamard.patch import ( patch_quantlinear, patch_wrapperlinear_to_apply_transform, patch_wrapperwalayer_forward_to_apply_transform, @@ -259,7 +259,7 @@ def apply_hadamard_transform( """Apply a Hadamard rotation to *model*. This is the main public entry point when you only want Hadamard (rather - than the polymorphic :func:`~auto_round.algorithms.rotation.apply_rotation`). + than the polymorphic :func:`~auto_round.algorithms.transforms.apply_rotation`). Args: model: Target model. diff --git a/auto_round/algorithms/rotation/hadamard/config.py b/auto_round/algorithms/transforms/hadamard/config.py similarity index 94% rename from auto_round/algorithms/rotation/hadamard/config.py rename to auto_round/algorithms/transforms/hadamard/config.py index 7ee370207..8801c1d86 100644 --- a/auto_round/algorithms/rotation/hadamard/config.py +++ b/auto_round/algorithms/transforms/hadamard/config.py @@ -18,7 +18,7 @@ from pydantic import BaseModel, Field, field_validator -from auto_round.algorithms.rotation.base import BaseRotationConfig +from auto_round.algorithms.transforms.base import BaseRotationConfig __all__ = ["HadamardConfig", "normalize_hadamard_config"] @@ -31,11 +31,11 @@ class HadamardConfig(BaseModel, BaseRotationConfig): This config is designed to be embedded inside a model's ``config.json`` for serialisation, and is also used at runtime to drive - :class:`~auto_round.algorithms.rotation.hadamard.apply.HadamardRotation`. + :class:`~auto_round.algorithms.transforms.hadamard.apply.HadamardRotation`. 
Attributes: algorithm: Fixed to ``"hadamard"`` – identifies this config in the - :class:`~auto_round.algorithms.rotation.base.BaseRotation` registry. + :class:`~auto_round.algorithms.transforms.base.BaseRotation` registry. block_size: Block size for the block-diagonal Hadamard matrix. hadamard_type: Which transform to use (``"hadamard"`` or ``"random_hadamard"``). diff --git a/auto_round/algorithms/rotation/hadamard/patch.py b/auto_round/algorithms/transforms/hadamard/patch.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/patch.py rename to auto_round/algorithms/transforms/hadamard/patch.py diff --git a/auto_round/algorithms/rotation/hadamard/transforms.py b/auto_round/algorithms/transforms/hadamard/transforms.py similarity index 97% rename from auto_round/algorithms/rotation/hadamard/transforms.py rename to auto_round/algorithms/transforms/hadamard/transforms.py index 0f41b4cd8..00b23aef4 100644 --- a/auto_round/algorithms/rotation/hadamard/transforms.py +++ b/auto_round/algorithms/transforms/hadamard/transforms.py @@ -26,11 +26,11 @@ import torch import torch.nn as nn -from auto_round.algorithms.rotation.hadamard.utils.math import ( +from auto_round.algorithms.transforms.hadamard.utils.math import ( deterministic_hadamard_matrix, random_hadamard_matrix, ) -from auto_round.algorithms.rotation.hadamard.utils.matrix import apply_transform_weight +from auto_round.algorithms.transforms.hadamard.utils.matrix import apply_transform_weight __all__ = [ "HadamardTransform", diff --git a/auto_round/algorithms/rotation/hadamard/utils/__init__.py b/auto_round/algorithms/transforms/hadamard/utils/__init__.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/__init__.py rename to auto_round/algorithms/transforms/hadamard/utils/__init__.py diff --git a/auto_round/algorithms/rotation/hadamard/utils/hadamards.safetensors b/auto_round/algorithms/transforms/hadamard/utils/hadamards.safetensors similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/hadamards.safetensors rename to auto_round/algorithms/transforms/hadamard/utils/hadamards.safetensors diff --git a/auto_round/algorithms/rotation/hadamard/utils/math.py b/auto_round/algorithms/transforms/hadamard/utils/math.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/math.py rename to auto_round/algorithms/transforms/hadamard/utils/math.py diff --git a/auto_round/algorithms/rotation/hadamard/utils/matrix.py b/auto_round/algorithms/transforms/hadamard/utils/matrix.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/matrix.py rename to auto_round/algorithms/transforms/hadamard/utils/matrix.py diff --git a/auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py b/auto_round/algorithms/transforms/hadamard/utils/triton/__init__.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/triton/__init__.py rename to auto_round/algorithms/transforms/hadamard/utils/triton/__init__.py diff --git a/auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py b/auto_round/algorithms/transforms/hadamard/utils/triton/mxfp4.py similarity index 100% rename from auto_round/algorithms/rotation/hadamard/utils/triton/mxfp4.py rename to auto_round/algorithms/transforms/hadamard/utils/triton/mxfp4.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index f3fff6b98..bba4c4921 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -163,9 +163,9 @@ def __new__( local_args = 
{k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} if NEW_ARCH: - from auto_round.compressors_new.entry import AutoRound as AutoRoundNew + from auto_round.compressors_new.entry import AutoRoundCompatible - return AutoRoundNew(**local_args, **kwargs) + return AutoRoundCompatible(**local_args, **kwargs) model_cls = [] diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 3cf88c321..537e565ed 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -33,6 +33,7 @@ from transformers import AutoConfig, set_seed from auto_round import envs +from auto_round.algorithms.quantization.sign_round.sign_sgd import SignSGD from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors.shard_writer import shard_writer from auto_round.compressors.utils import ( @@ -68,7 +69,6 @@ get_gguf_scheme, preset_name_to_scheme, ) -from auto_round.sign_sgd import SignSGD from auto_round.special_model_handler import get_predefined_ignore_layers, update_module from auto_round.utils import ( INNER_SUPPORTED_LAYER_TYPES, diff --git a/auto_round/compressors_new/__init__.py b/auto_round/compressors_new/__init__.py index 8bee639c5..66dbe5de9 100644 --- a/auto_round/compressors_new/__init__.py +++ b/auto_round/compressors_new/__init__.py @@ -19,26 +19,26 @@ if TYPE_CHECKING: from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor - from auto_round.compressors_new.entry import AutoRound, Compressor + from auto_round.compressors_new.entry import AutoRoundCompatible, AutoRound from auto_round.compressors_new.zero_shot import ZeroShotCompressor __all__ = [ - "Compressor", + "AutoRound", "CalibCompressor", "CalibratedRTNCompressor", "ZeroShotCompressor", - "AutoRound", + "AutoRoundCompatible", ] def __getattr__(name): """Lazy import to avoid circular dependencies.""" - if name == "Compressor" or name == "AutoRound": - from auto_round.compressors_new.entry import Compressor, AutoRound + if name == "AutoRound" or name == "AutoRoundCompatible": + from auto_round.compressors_new.entry import AutoRound, AutoRoundCompatible - if name == "Compressor": - return Compressor - return AutoRound + if name == "AutoRound": + return AutoRound + return AutoRoundCompatible elif name in ("CalibCompressor", "CalibratedRTNCompressor"): from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor diff --git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py index 50c72ce05..2a0fc9648 100644 --- a/auto_round/compressors_new/architecture_visualization.py +++ b/auto_round/compressors_new/architecture_visualization.py @@ -11,21 +11,21 @@ def print_architecture_table(): """Print architecture combination table""" print("\n" + "=" * 110) - print("Compressor New Architecture - Mixin Pattern Combination Table") + print("AutoRound New Architecture - Mixin Pattern Combination Table") print("=" * 110 + "\n") - print(f"{'Model Type':<15} {'Config Type':<20} {'Compressor (dynamic class)':<40} {'Base classes':<35}") + print(f"{'Model Type':<15} {'Config Type':<20} {'AutoRound (dynamic class)':<40} {'Base classes':<35}") print("-" * 110) # LLM combinations - print(f"{'LLM':<15} {'AutoRoundConfig':<20} {'CalibCompressor':<40} {'CalibCompressor':<35}") + print(f"{'LLM':<15} {'SignRoundConfig':<20} {'CalibCompressor':<40} {'CalibCompressor':<35}") print(f"{'LLM':<15} {'RTNConfig':<20} {'CalibratedRTNCompressor':<40} {'CalibratedRTNCompressor':<35}") 
print(f"{'LLM':<15} {'RTNConfig':<20} {'ZeroShotCompressor':<40} {'ZeroShotCompressor':<35}") print() # MLLM combinations (dynamic classes created in entry.py) - print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'MLLMCalibCompressor':<40} {'MLLMMixin + CalibCompressor':<35}") + print(f"{'MLLM':<15} {'SignRoundConfig':<20} {'MLLMCalibCompressor':<40} {'MLLMMixin + CalibCompressor':<35}") print( f"{'MLLM':<15} {'RTNConfig':<20} {'MLLMCalibratedRTNCompressor':<40} " f"{'MLLMMixin + CalibratedRTNCompressor':<35}" @@ -36,7 +36,7 @@ def print_architecture_table(): # Diffusion combinations (dynamic classes created in entry.py) print( - f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'DiffusionCalibCompressor':<40} " + f"{'Diffusion':<15} {'SignRoundConfig':<20} {'DiffusionCalibCompressor':<40} " f"{'DiffusionMixin + CalibCompressor':<35}" ) print( @@ -62,13 +62,13 @@ def print_mixin_explanation(): print("-" * 110) print(" 1. MLLMMixin - MLLM features (processor, template, quant_nontext_module, etc.)") print(" 2. DiffusionMixin - Diffusion features (pipeline loading, guidance_scale, etc.)") - print(" 3. CalibCompressor - AutoRound: gradient-based calibration quantization") + print(" 3. CalibCompressor - AutoRoundCompatible: gradient-based calibration quantization") print(" 4. CalibratedRTNCompressor - RTN with importance-matrix (imatrix) or act calibration") print(" 5. ZeroShotCompressor - Zero-shot RTN (no calibration data needed)") print("\n🎯 Combination Approach:") print("-" * 110) - print(" Dynamic classes created on-the-fly inside Compressor.__new__():") + print(" Dynamic classes created on-the-fly inside AutoRound.__new__():") print(" class MLLMCalibCompressor(MLLMMixin, CalibCompressor): pass") print(" class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor): pass") print(" class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): pass") @@ -126,15 +126,15 @@ def print_usage_examples(): print("Usage Examples") print("=" * 110 + "\n") - print("Example 1: MLLM + AutoRound (gradient-based)") + print("Example 1: MLLM + AutoRoundCompatible (gradient-based)") print("-" * 110) print( """ -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.compressors_new.entry import AutoRound +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) -compressor = Compressor( +config = SignRoundConfig(scheme="W4A16", iters=200, nsamples=128) +compressor = AutoRound( config=config, model="/models/Qwen2-VL-2B-Instruct", processor=processor, @@ -152,7 +152,7 @@ def print_usage_examples(): from auto_round.algorithms.quantization.rtn.config import RTNConfig config = RTNConfig(scheme="W4A16") -compressor = Compressor( +compressor = AutoRound( config=config, model="/models/Qwen2-VL-2B-Instruct", format="gguf_k", # gguf_k triggers CalibratedRTNCompressor @@ -162,12 +162,12 @@ def print_usage_examples(): """ ) - print("\nExample 3: Diffusion + AutoRound") + print("\nExample 3: Diffusion + AutoRoundCompatible") print("-" * 110) print( """ -config = AutoRoundConfig(scheme="W4A16", iters=200) -compressor = Compressor( +config = SignRoundConfig(scheme="W4A16", iters=200) +compressor = AutoRound( config=config, model="/models/stable-diffusion-2-1", guidance_scale=7.5, @@ -190,7 +190,7 @@ def print_mro_example(): print("-" * 110) print( """ -MLLMCalibCompressor (dynamic, created in Compressor.__new__) +MLLMCalibCompressor 
(dynamic, created in AutoRound.__new__) └─> MLLMMixin └─> CalibCompressor └─> BaseCompressor @@ -222,12 +222,12 @@ def print_decision_tree(): """Print decision tree""" print("=" * 110) - print("Compressor Creation Decision Tree") + print("AutoRound Creation Decision Tree") print("=" * 110 + "\n") print( """ -Compressor.__new__(config, model, format, **kwargs) +AutoRound.__new__(config, model, format, **kwargs) │ ├─ Step 1: Detect model type │ model_type = detect_model_type(model) @@ -235,7 +235,7 @@ def print_decision_tree(): │ ├─ is_mllm_model() → "mllm" │ └─ else → "llm" │ -├─ isinstance(config, AutoRoundConfig) +├─ isinstance(config, SignRoundConfig) │ ├─ model_type == "mllm" │ │ └─> class MLLMCalibCompressor(MLLMMixin, CalibCompressor) │ ├─ model_type == "diffusion" @@ -294,7 +294,7 @@ def print_quantizer_interface(): Implementations: ├─ RTNQuantizer.quantize_block(block_name: str) ├─ OptimizedRTNQuantizer.quantize_block(block_name: str, input_ids, input_others) - └─ ARQuantizer.quantize_block(block_name: Union[str, list[str]], input_ids, input_others) + └─ SignRoundQuantizer.quantize_block(block_name: Union[str, list[str]], input_ids, input_others) """ ) @@ -326,7 +326,7 @@ def main(): print() # MLLM combinations - print(f"{'MLLM':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'MLLMCalibCompressor':<35}") + print(f"{'MLLM':<15} {'SignRoundConfig':<20} {'AutoRoundCompatible':<20} {'MLLMCalibCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + CalibCompressor':<35}") print(f"{'MLLM':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'MLLMImatrixCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = MLLMMixin + ImatrixCompressor':<35}") @@ -336,7 +336,7 @@ def main(): print() # Diffusion combinations - print(f"{'Diffusion':<15} {'AutoRoundConfig':<20} {'AutoRound':<20} {'DiffusionCalibCompressor':<35}") + print(f"{'Diffusion':<15} {'SignRoundConfig':<20} {'AutoRoundCompatible':<20} {'DiffusionCalibCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + CalibCompressor':<35}") print(f"{'Diffusion':<15} {'RTNConfig':<20} {'RTN + imatrix':<20} {'DiffusionImatrixCompressor':<35}") print(f"{'':<15} {'':<20} {'':<20} {' = DiffusionMixin + ImatrixCompressor':<35}") diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 5f072982c..1d52ee509 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -22,7 +22,7 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization import BaseQuantizers, QuantizationConfig -from auto_round.algorithms.rotation import ( +from auto_round.algorithms.transforms import ( BaseRotationConfig, apply_rotation, check_supported_schemes, @@ -93,7 +93,7 @@ class SerializedCompressorConfig: super_bits: Optional[int] = None super_group_size: Optional[int] = None to_quant_block_names: Optional[list[str]] = None - rotation_configs: Optional[list[dict[str, Any]]] = None + transform_configs: Optional[list[dict[str, Any]]] = None class BaseCompressor(object): @@ -132,18 +132,26 @@ def __init__( enable_torch_compile: bool = False, seed: int = 42, low_cpu_mem_usage: bool = True, + layer_config=None, + nsamples: int = None, + seqlen: int = None, **kwargs, ): self.quantize_config = None - self.rotation_configs: list[BaseRotationConfig] = [] + self.transform_configs: list[BaseRotationConfig] = [] _config_list = config if isinstance(config, list) else [config] for _cfg in _config_list: if isinstance(_cfg, QuantizationConfig): 
self.quantize_config = _cfg elif isinstance(_cfg, BaseRotationConfig): - self.rotation_configs.append(_cfg) + self.transform_configs.append(_cfg) assert self.quantize_config is not None, "QuantizationConfig is required for Compressor" + # Compressor-level calibration/layer params (do not live in QuantizationConfig). + self.layer_config = layer_config + self.nsamples = nsamples if nsamples is not None else 128 + self.seqlen = seqlen if seqlen is not None else 2048 + # Scheme is passed directly to the compressor, not stored in QuantizationConfig. self.scheme = scheme @@ -553,7 +561,7 @@ def post_init(self) -> None: # Initialize scheme state from quantize_config before resolving. cfg = self.quantize_config self.scale_dtype = cfg.scale_dtype - self.layer_config = cfg.layer_config + # self.layer_config is already set from __init__ (direct compressor param). self.ignore_layers = cfg.ignore_layers self.quant_lm_head = cfg.quant_lm_head self.to_quant_block_names = cfg.to_quant_block_names @@ -572,6 +580,9 @@ def post_init(self) -> None: self.quantizer.compress_context = self.compress_context self.quantizer.model = self.model_context.model self.quantizer.scale_dtype = self.scale_dtype + # Sync compressor-owned calibration params to quantizer. + self.quantizer.seqlen = self.seqlen + self.quantizer.nsamples = self.nsamples self.wrapper_block = wrapper_block # ── Phase 2: resolve output format ─────────────────────────────────── @@ -583,6 +594,10 @@ def post_init(self) -> None: ShardWriter.reset() self.shard_writer = ShardWriter(self.model_context.model, bits=8) + # Snapshot the user-specified layer_config before GGUF processing may + # add extra entries, so we can distinguish them later in Phase 2b. + _pre_gguf_layer_config = copy.copy(self.layer_config) or {} + # ── Phase 2b: propagate GGUF-adjusted attrs back to quantizer ──────── # gguf_args_check (called inside get_formats) may have overridden # bits / sym / data_type / super_bits / super_group_size / group_size @@ -651,9 +666,7 @@ def post_init(self) -> None: self.scheme = _new_scheme _gguf_layer_cfg = { - k: v - for k, v in (self.__dict__.get("layer_config") or {}).items() - if k not in (self.quantize_config.layer_config or {}) + k: v for k, v in (self.__dict__.get("layer_config") or {}).items() if k not in (_pre_gguf_layer_config) } if _gguf_layer_cfg: if self.layer_config is None: @@ -662,10 +675,10 @@ def post_init(self) -> None: self.layer_config.setdefault(_lname, _lval) # ── Phase 2d: apply rotation transforms ────────────────────────────── - if self.rotation_configs: + if self.transform_configs: check_supported_schemes(self.scheme) need_calibration = self.quantize_config.iters > 0 - for rotation_cfg in self.rotation_configs: + for rotation_cfg in self.transform_configs: self.model_context.model = apply_rotation( self.model_context.model, rotation_cfg, diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index ef0372dba..a272a0f6c 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -324,11 +324,11 @@ def calib(self, nsamples, bs): # slow here self.dataloader = get_dataloader( self.model_context.tokenizer, - self.quantize_config.seqlen, + self.seqlen, dataset, self.seed, bs, - self.quantize_config.nsamples, + self.nsamples, ) else: self.dataloader = self.dataset @@ -349,7 +349,7 @@ def calib(self, nsamples, bs): logger.error("please provide tokenizer for string input") exit(-1) data = self.model_context.tokenizer( - data, truncation=True, 
max_length=self.quantize_config.seqlen, return_tensors="pt" + data, truncation=True, max_length=self.seqlen, return_tensors="pt" ).data data_new = {} for key in data.keys(): @@ -365,7 +365,7 @@ def calib(self, nsamples, bs): if key == "images": data_new[key] = to_dtype(data_new[key], self.model.dtype) input_ids = data_new["input_ids"] - if input_ids.shape[-1] < self.quantize_config.seqlen: + if input_ids.shape[-1] < self.seqlen: continue if need_attention_mask: if ( @@ -429,7 +429,7 @@ def calib(self, nsamples, bs): except RuntimeError as error: error_msg = str(error) if "The expanded size of the tensor" in str(error_msg) and "must match the existing size" in error_msg: - check_seqlen_compatible(self.quantize_config.seqlen, self.model_context.tokenizer, self.model) + check_seqlen_compatible(self.seqlen, self.model_context.tokenizer, self.model) logger.warning( "When quantization encounters tensor shape mismatch error, " "you can try to avoid it with batch_size=1" @@ -444,7 +444,7 @@ def calib(self, nsamples, bs): if total_cnt == 0: logger.error( f"no data has been cached, please provide more data with sequence length " - f">={self.quantize_config.seqlen} in the dataset or decease the sequence length" + f">={self.seqlen} in the dataset or decease the sequence length" ) exit(-1) elif total_cnt < nsamples: @@ -849,7 +849,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu( all_first_block_names, - self.quantize_config.nsamples, + self.nsamples, layer_names, ) self.inputs = all_inputs @@ -859,9 +859,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if is_quantized_embedding: all_inputs = copy.deepcopy(self.inputs) clear_memory(self.inputs, device_list=self.compress_context.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.quantize_config.nsamples, layer_names - ) + all_q_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) self.inputs = all_q_inputs # Remove accelerate dispatch hooks before moving parameters. # hf_device_map is kept for reference but hooks are no longer needed. 
@@ -1022,9 +1020,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: if enable_quanted_input: logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu( - [], self.quantize_config.nsamples, layer_names=layer_names - ) + q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: accelerate.hooks.remove_hook_from_submodules( self.model @@ -1052,29 +1048,25 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: def _check_compatibility(self) -> None: """Checks compatibility of the configurations and model.""" if ( - self.quantize_config.seqlen is not None + self.seqlen is not None and hasattr(self.model_context.model, "config") and hasattr(self.model_context.model.config, "max_position_embeddings") ): - if self.model_context.model.config.max_position_embeddings < self.quantize_config.seqlen: + if self.model_context.model.config.max_position_embeddings < self.seqlen: logger.warning( f"Change sequence length to {self.model_context.model.config.max_position_embeddings} " "due to the limitation of max_position_embeddings" ) - self.quantize_config.seqlen = min( - self.quantize_config.seqlen, self.model_context.model.config.max_position_embeddings - ) + self.seqlen = min(self.seqlen, self.model_context.model.config.max_position_embeddings) - if self.quantize_config.seqlen is not None and hasattr(self.model_context.tokenizer, "model_max_length"): - if self.model_context.tokenizer.model_max_length < self.quantize_config.seqlen: + if self.seqlen is not None and hasattr(self.model_context.tokenizer, "model_max_length"): + if self.model_context.tokenizer.model_max_length < self.seqlen: logger.warning( f"Change sequence length to {self.model_context.tokenizer.model_max_length} " "due to the limitation of model_max_length. " "You can also try to increase the model_max_length to avoid this issue." 
) - self.quantize_config.seqlen = min( - self.quantize_config.seqlen, self.model_context.tokenizer.model_max_length - ) + self.seqlen = min(self.seqlen, self.model_context.tokenizer.model_max_length) if self.group_size == 0 and "fp8" not in self.data_type: logger.warning("`group_size==0` is not supported for data_type other than fp8 ") @@ -1138,11 +1130,9 @@ def _quantize_via_rtn_blockwise(self) -> None: "quantize layers outside blocks for static activation quantizaiton" " will significantly increase calibration time" ) - all_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.quantize_config.nsamples, layer_names - ) + all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) else: - all_inputs = self.cache_inter_data(all_first_block_names, self.quantize_config.nsamples) + all_inputs = self.cache_inter_data(all_first_block_names, self.nsamples) # Clear hooks for multi-GPU setups if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: @@ -1227,6 +1217,10 @@ def _quantize_via_rtn_blockwise(self) -> None: if len(self.compress_context.device_list) > 1: accelerate.hooks.remove_hook_from_submodules(block) + if self.compress_context.low_gpu_mem_usage: + block.to("cpu") + self.compress_context.clear_memory() + # ── Pure algorithm ──────────────────────────────────────────── self.quantizer.quantize_block(block) @@ -1285,11 +1279,11 @@ def _quant_rtn_with_imatrix(self) -> None: dataset_name = self.dataset.replace(" ", "") self.dataloader = get_dataloader( self.model_context.tokenizer, - self.quantize_config.seqlen, + self.seqlen, dataset_name, self.seed, self.quantize_config.batch_size, - self.quantize_config.nsamples, + self.nsamples, ) else: self.dataloader = self.dataset diff --git a/auto_round/compressors_new/docs/compressors_new_architecture.md b/auto_round/compressors_new/docs/compressors_new_architecture.md index 7b3012988..78250c3c7 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture.md @@ -63,7 +63,7 @@ class QuantizationConfig(AlgConfig): Subclasses: - `RTNConfig(QuantizationConfig)` — adds `disable_opt_rtn`, `seqlen`, `nsamples`, `batch_size` -- `AutoRoundConfig(QuantizationConfig)` — adds `iters`, `lr`, `nblocks`, `enable_minmax_tuning`, … +- `SignRoundConfig(QuantizationConfig)` — adds `iters`, `lr`, `nblocks`, `enable_minmax_tuning`, … ### AlgConfig @@ -159,7 +159,7 @@ Compressor.__new__(config, model, format, **kwargs) │ ├─ is_mllm_model() → "mllm" │ └─ else → "llm" │ -├─ isinstance(config, AutoRoundConfig) +├─ isinstance(config, SignRoundConfig) │ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) │ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) │ └─ llm → CalibCompressor @@ -216,9 +216,9 @@ class MLLMMixin: ```python from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +config = SignRoundConfig(scheme="W4A16", iters=200, nsamples=128) compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) quantized_model, layer_config = compressor.quantize() ``` @@ -226,7 +226,7 @@ quantized_model, layer_config = compressor.quantize() ### MLLM (vision-language model) ```python -config = 
AutoRoundConfig(scheme="W4A16", iters=200) +config = SignRoundConfig(scheme="W4A16", iters=200) compressor = Compressor( config=config, model="/models/Qwen2-VL-2B-Instruct", @@ -240,7 +240,7 @@ compressor = Compressor( ### Diffusion model ```python -config = AutoRoundConfig(scheme="W4A16", iters=200) +config = SignRoundConfig(scheme="W4A16", iters=200) compressor = Compressor( config=config, model="/models/stable-diffusion-2-1", @@ -307,7 +307,7 @@ if model_type == "audio": | Aspect | Description | |---|---| | **Entry point** | Single `Compressor` class, auto-detects model type | -| **Config** | `QuantizationConfig` dataclass; subclasses `RTNConfig`, `AutoRoundConfig` | +| **Config** | `QuantizationConfig` dataclass; subclasses `RTNConfig`, `SignRoundConfig` | | **Model loading** | `ModelContext.__init__` loads eagerly; `apply_patches()` runs before quantizer setup | | **9 combinations** | 3 model types × 3 compressors, dynamic classes via Mixin | | **Quantizer interface** | Name-based `quantize_block(name)` / `quantize_layer(name)`, not module objects | diff --git a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md index 88bebc103..8ab713d07 100644 --- a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md +++ b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md @@ -60,7 +60,7 @@ class QuantizationConfig(AlgConfig): 子类: - `RTNConfig(QuantizationConfig)` — 新增 `disable_opt_rtn`、`seqlen`、`nsamples`、`batch_size` -- `AutoRoundConfig(QuantizationConfig)` — 新增 `iters`、`lr`、`nblocks`、`enable_minmax_tuning` 等 +- `SignRoundConfig(QuantizationConfig)` — 新增 `iters`、`lr`、`nblocks`、`enable_minmax_tuning` 等 ### AlgConfig @@ -155,7 +155,7 @@ Compressor.__new__(config, model, format, **kwargs) │ ├─ is_mllm_model() → "mllm" │ └─ 其他 → "llm" │ -├─ isinstance(config, AutoRoundConfig) +├─ isinstance(config, SignRoundConfig) │ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) │ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) │ └─ llm → CalibCompressor @@ -240,9 +240,9 @@ MLLMCalibCompressor(entry.py 中动态创建) ```python from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -config = AutoRoundConfig(scheme="W4A16", iters=200, nsamples=128) +config = SignRoundConfig(scheme="W4A16", iters=200, nsamples=128) compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) quantized_model, layer_config = compressor.quantize() ``` @@ -250,7 +250,7 @@ quantized_model, layer_config = compressor.quantize() ### MLLM(视觉-语言模型) ```python -config = AutoRoundConfig(scheme="W4A16", iters=200) +config = SignRoundConfig(scheme="W4A16", iters=200) compressor = Compressor( config=config, model="/models/Qwen2-VL-2B-Instruct", @@ -265,7 +265,7 @@ quantized_model, layer_config = compressor.quantize() ### Diffusion 扩散模型 ```python -config = AutoRoundConfig(scheme="W4A16", iters=200) +config = SignRoundConfig(scheme="W4A16", iters=200) compressor = Compressor( config=config, model="/models/stable-diffusion-2-1", @@ -335,12 +335,12 @@ if model_type == "audio": ```python from auto_round.compressors_new.entry import detect_model_type, Compressor -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig 
model_path = "/your/model/path" print(f"模型类型: {detect_model_type(model_path)}") -config = AutoRoundConfig(scheme="W4A16") +config = SignRoundConfig(scheme="W4A16") comp = Compressor(config=config, model=model_path) print(f"Compressor 类型: {type(comp).__name__}") ``` @@ -364,7 +364,7 @@ print(f"Compressor 类型: {type(comp).__name__}") | 特性 | 说明 | |---|---| | **统一入口** | 单一 `Compressor` 类,自动检测模型类型 | -| **配置** | `QuantizationConfig` dataclass;子类 `RTNConfig`、`AutoRoundConfig` | +| **配置** | `QuantizationConfig` dataclass;子类 `RTNConfig`、`SignRoundConfig` | | **模型加载** | `ModelContext.__init__` 立即加载;`apply_patches()` 在量化器初始化前运行 | | **9 种组合** | 3 种模型类型 × 3 种 Compressor,通过 Mixin 动态创建 | | **量化器接口** | 基于名称的 `quantize_block(name)` / `quantize_layer(name)`,非模块对象 | diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 45bb5b997..b97d8aa65 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -6,8 +6,9 @@ import torch from auto_round.algorithms.alg_config import AlgConfig -from auto_round.algorithms.quantization.auto_round.config import AutoRoundConfig from auto_round.algorithms.quantization.rtn.config import RTNConfig +from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig +from auto_round.algorithms.transforms.hadamard.config import HadamardConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor from auto_round.compressors_new.utils import check_need_act_calibration @@ -19,7 +20,7 @@ def _preview_resolved_attrs(config, scheme=None) -> dict: """Resolve scheme attributes without mutating config, for routing decisions. - Called in ``Compressor.__new__`` before the concrete compressor class is + Called in ``AutoRound.__new__`` before the concrete compressor class is chosen. ``SchemeMixin.resolve_scheme()`` will do the authoritative resolution later; this is just a lightweight preview so routing logic (``enable_imatrix``, ``needs_act_calib``, etc.) can use the correct values @@ -77,19 +78,66 @@ def detect_model_type(model): return "llm" -class Compressor(object): - SKIP_ARGS = ("local_args", "kwargs", "cls", "config") +class AutoRound(object): + SKIP_ARGS = ("local_args", "kwargs", "cls", "alg_configs", "quant_config", "quant_configs") + + # Mapping from string alias to config class (and optional defaults override). + _CONFIG_ALIASES: dict[str, type] = { + "sign_round": SignRoundConfig, + "signround": SignRoundConfig, + "rtn": RTNConfig, + "hadamard": HadamardConfig, + } + + @classmethod + def _resolve_config(cls, config: Union[str, AlgConfig, list]) -> Union[AlgConfig, list[AlgConfig]]: + """Convert string alias(es) to the corresponding config instance(s) with default parameters.""" + if isinstance(config, str): + key = config.strip().lower() + if key not in cls._CONFIG_ALIASES: + raise ValueError(f"Unknown config alias '{config}'. 
" f"Supported: {list(cls._CONFIG_ALIASES.keys())}") + return cls._CONFIG_ALIASES[key]() + if isinstance(config, list): + return [cls._resolve_config(c) for c in config] + return config def __new__( cls, - config: Union[AlgConfig, list[AlgConfig]], + alg_configs: Union[str, AlgConfig, list[Union[str, AlgConfig]]], model: Union[torch.nn.Module, str], tokenizer=None, platform="hf", format=None, scheme="W4A16", + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + low_cpu_mem_usage: bool = True, + layer_config=None, + nsamples: int = None, + seqlen: int = None, **kwargs, ): + from auto_round.algorithms.quantization.config import QuantizationConfig + + # Resolve string alias(es) to config instance(s) before routing. + alg_configs = cls._resolve_config(alg_configs) + + # Extract the single QuantizationConfig from a list; validate at most one exists. + if isinstance(alg_configs, list): + quant_configs = [c for c in alg_configs if isinstance(c, QuantizationConfig)] + if len(quant_configs) == 0: + raise ValueError("At least one QuantizationConfig (SignRoundConfig / RTNConfig) is required.") + if len(quant_configs) > 1: + raise ValueError( + f"Only one QuantizationConfig is allowed, but got {len(quant_configs)}: " + f"{[type(c).__name__ for c in quant_configs]}" + ) + quant_config = quant_configs[0] + else: + quant_config = alg_configs + # using different compressor base on AlgConfigs local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} @@ -102,42 +150,42 @@ def __new__( if has_multimodal_assets and model_type != "mllm": model_type = "mllm" - if isinstance(config, AutoRoundConfig): - # For AutoRound, we need calibration-based compression + if isinstance(quant_config, SignRoundConfig): + # For AutoRoundCompatible, we need calibration-based compression # Dynamically create combined class using Mixin pattern if model_type == "mllm": from auto_round.compressors_new.mllm_mixin import MLLMMixin # Create dynamic class: MLLMMixin + CalibCompressor class MLLMCalibCompressor(MLLMMixin, CalibCompressor): - """MLLM model with AutoRound calibration compression""" + """MLLM model with AutoRoundCompatible calibration compression""" pass - return MLLMCalibCompressor(config, **local_args, **kwargs) + return MLLMCalibCompressor(alg_configs, **local_args, **kwargs) elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin # Create dynamic class: DiffusionMixin + CalibCompressor class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): - """Diffusion model with AutoRound calibration compression""" + """Diffusion model with AutoRoundCompatible calibration compression""" pass - return DiffusionCalibCompressor(config, **local_args, **kwargs) + return DiffusionCalibCompressor(alg_configs, **local_args, **kwargs) else: - return CalibCompressor(config, **local_args, **kwargs) + return CalibCompressor(alg_configs, **local_args, **kwargs) - elif isinstance(config, RTNConfig): + elif isinstance(quant_config, RTNConfig): enable_imatrix = False - disable_opt_rtn = getattr(config, "disable_opt_rtn", False) + disable_opt_rtn = getattr(quant_config, "disable_opt_rtn", False) # If disable_opt_rtn was not explicitly set and scheme is W8A16/W8A8, # auto-disable optimization to improve efficiency. 
- if getattr(config, "orig_disable_opt_rtn", None) is None: + if getattr(quant_config, "orig_disable_opt_rtn", None) is None: if isinstance(scheme, str) and scheme.upper() in ["W8A16", "W8A8"]: logger.warning("`disable_opt_rtn` is turned on for W8A16/W8A8 quantization to improve efficiency.") disable_opt_rtn = True - config.disable_opt_rtn = True + quant_config.disable_opt_rtn = True if not disable_opt_rtn: has_gguf_k = "gguf" in format.lower() and "_k" in format.lower() if format else False if has_gguf_k: @@ -145,9 +193,9 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): else: # Resolve scheme attrs for routing (config hasn't been through # SchemeMixin yet; user may have specified only scheme="W4A16"). - _resolved = _preview_resolved_attrs(config, scheme) - _sym = _resolved.get("sym", getattr(config, "sym", None)) - _data_type = _resolved.get("data_type", getattr(config, "data_type", "") or "") + _resolved = _preview_resolved_attrs(quant_config, scheme) + _sym = _resolved.get("sym", getattr(quant_config, "sym", None)) + _data_type = _resolved.get("data_type", getattr(quant_config, "data_type", "") or "") if _sym is not None and _sym is False: enable_imatrix = False elif _data_type == "int": @@ -157,10 +205,10 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): else: _resolved = {} - _resolved = _resolved if not disable_opt_rtn else _preview_resolved_attrs(config, scheme) - _act_bits = _resolved.get("act_bits", getattr(config, "act_bits", None)) - _act_data_type = _resolved.get("act_data_type", getattr(config, "act_data_type", None)) - _act_dynamic = _resolved.get("act_dynamic", getattr(config, "act_dynamic", None)) + _resolved = _resolved if not disable_opt_rtn else _preview_resolved_attrs(quant_config, scheme) + _act_bits = _resolved.get("act_bits", getattr(quant_config, "act_bits", None)) + _act_data_type = _resolved.get("act_data_type", getattr(quant_config, "act_data_type", None)) + _act_dynamic = _resolved.get("act_dynamic", getattr(quant_config, "act_dynamic", None)) _is_act_quantize = _act_bits is not None and _act_bits <= 8 needs_act_calib = _is_act_quantize and check_need_act_calibration( _act_dynamic, @@ -177,7 +225,7 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): is_auto_scheme = isinstance(scheme, _AutoScheme) if enable_imatrix or needs_act_calib or is_auto_scheme: - config._alg_cls = "OptimizedRTNQuantizer" + quant_config._alg_cls = "OptimizedRTNQuantizer" # For RTN with calibration data, dynamically combine with model-specific Mixin if model_type == "mllm": from auto_round.compressors_new.mllm_mixin import MLLMMixin @@ -187,7 +235,7 @@ class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor): pass - return MLLMCalibratedRTNCompressor(config, **local_args, **kwargs) + return MLLMCalibratedRTNCompressor(alg_configs, **local_args, **kwargs) elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin @@ -196,11 +244,11 @@ class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor): pass - return DiffusionCalibratedRTNCompressor(config, **local_args, **kwargs) + return DiffusionCalibratedRTNCompressor(alg_configs, **local_args, **kwargs) else: - return CalibratedRTNCompressor(config, **local_args, **kwargs) + return CalibratedRTNCompressor(alg_configs, **local_args, **kwargs) else: - config._alg_cls = "RTNQuantizer" + quant_config._alg_cls = "RTNQuantizer" # Zero-shot RTN: no calibration data needed if model_type == "mllm": from 
auto_round.compressors_new.mllm_mixin import MLLMMixin @@ -210,7 +258,7 @@ class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): pass - return MLLMZeroShotCompressor(config, **local_args, **kwargs) + return MLLMZeroShotCompressor(alg_configs, **local_args, **kwargs) elif model_type == "diffusion": from auto_round.compressors_new.diffusion_mixin import DiffusionMixin @@ -219,16 +267,16 @@ class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor): pass - return DiffusionZeroShotCompressor(config, **local_args, **kwargs) + return DiffusionZeroShotCompressor(alg_configs, **local_args, **kwargs) else: - return ZeroShotCompressor(config, **local_args, **kwargs) + return ZeroShotCompressor(alg_configs, **local_args, **kwargs) -class AutoRound: - """AutoRound wrapper class for backward compatibility. +class AutoRoundCompatible: + """AutoRoundCompatible wrapper class for backward compatibility. - This class provides the same API as the old AutoRound class but internally - uses the new Compressor architecture with Mixin pattern. + This class provides the same API as the old AutoRoundCompatible class but internally + uses the new AutoRound architecture with Mixin pattern. Args: model: Model object or model name to load @@ -251,8 +299,8 @@ class AutoRound: Example: >>> # Old API - still works - >>> from auto_round.compressors_new.entry import AutoRound - >>> autoround = AutoRound( + >>> from auto_round.compressors_new.entry import AutoRoundCompatible + >>> autoround = AutoRoundCompatible( ... model="/models/opt-125m", ... bits=4, ... group_size=128, @@ -326,9 +374,9 @@ def __new__( low_cpu_mem_usage: bool = True, **kwargs, ): - """Create AutoRound instance using new Compressor architecture. + """Create AutoRoundCompatible instance using new AutoRound architecture. - This method translates old AutoRound API to new Compressor API. + This method translates old AutoRoundCompatible API to new AutoRound API. 
""" from auto_round.utils import is_diffusion_model, is_mllm_model @@ -350,7 +398,6 @@ def __new__( # RTN mode disable_opt_rtn = kwargs.pop("disable_opt_rtn", None) config = RTNConfig( - layer_config=layer_config, bits=bits, group_size=group_size, sym=sym, @@ -362,24 +409,19 @@ def __new__( act_dynamic=act_dynamic, disable_opt_rtn=disable_opt_rtn, # for optRTN - seqlen=seqlen, - nsamples=nsamples, batch_size=batch_size, **common_config_kwargs, ) else: - # AutoRound mode + # AutoRoundCompatible mode lr = kwargs.pop("lr", None) minmax_lr = kwargs.pop("minmax_lr", None) enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True) enable_norm_bias_tuning = kwargs.pop("enable_norm_bias_tuning", False) enable_quanted_input = kwargs.pop("enable_quanted_input", True) - config = AutoRoundConfig( - layer_config=layer_config, + config = SignRoundConfig( iters=iters, - nsamples=nsamples, - seqlen=seqlen, batch_size=batch_size, gradient_accumulate_steps=gradient_accumulate_steps, bits=bits, @@ -423,9 +465,9 @@ def __new__( else: logger.info("Using LLM mode (new architecture).") - # Create Compressor instance using new architecture - compressor = Compressor( - config=config, + # Create AutoRound instance using new architecture + compressor = AutoRound( + alg_configs=config, model=model, tokenizer=tokenizer, platform=platform, @@ -437,6 +479,9 @@ def __new__( enable_torch_compile=enable_torch_compile, seed=seed, low_cpu_mem_usage=low_cpu_mem_usage, + layer_config=layer_config, + nsamples=nsamples, + seqlen=seqlen, # MLLM parameters processor=processor, image_processor=image_processor, diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index 50cda6492..6fc5f480a 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -139,7 +139,7 @@ def calib(self, nsamples, bs): image_processor=image_processor, dataset=dataset, extra_data_dir=self.extra_data_dir, - seqlen=self.quantize_config.seqlen, + seqlen=self.seqlen, bs=bs, seed=self.seed, nsamples=nsamples, diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 840ff9e54..49df93bcb 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -31,6 +31,7 @@ get_module, global_state, memory_monitor, + mv_module_from_gpu, set_module, ) @@ -119,10 +120,36 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") block = get_module(self.model, block_name) + + # ── Infrastructure: materialize ─────────────────────────── + materialize_model_(block) + + # ── Pure algorithm ──────────────────────────────────────── self.quantizer.quantize_block(block) - if self.low_cpu_mem_usage and not self.is_immediate_saving: - self._offloader(self.model, block_name) + # ── Infrastructure: shard write / device cleanup ────────── + if self.is_immediate_saving: + # Save non-quantized leaf modules (e.g. norms, embeddings in block). + for _n, m in block.named_modules(): + if ( + not any(m.children()) + and len(m.state_dict()) > 0 + and hasattr(m, "global_name") + and m.global_name not in tied_weights_layers + and not check_to_quantized(m) + ): + set_module(self.model, m.global_name, copy.deepcopy(m)) + self.shard_writer.write(name=m.global_name) + get_module(self.model, m.global_name).to("meta") + m.to("meta") + # Write at block scope for any remaining params/buffers. 
+ self.shard_writer.write(name=block_name) + block.to("meta") + else: + mv_module_from_gpu(block) + if self.low_cpu_mem_usage: + self._offloader(self.model, block_name) + clear_memory(device_list=self.device_list) memory_monitor.log_summary() pbar.update(1) diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index 979b6f67d..07f378de6 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -17,6 +17,7 @@ from auto_round.context.base import BaseContext from auto_round.utils.device import ( + clear_memory, clear_memory_if_reached_threshold, get_major_device, parse_available_devices, @@ -65,3 +66,8 @@ def __init__( self.formats = formats self.static_kv_dtype = static_kv_dtype self.static_attention_dtype = static_attention_dtype + + def clear_memory(self, tensor=None): + """Clear GPU/CPU memory only when ``low_gpu_mem_usage`` is enabled.""" + if self.low_gpu_mem_usage: + clear_memory(tensor, device_list=self.device_list) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index d98545679..5596642bb 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -30,7 +30,7 @@ import cpuinfo if TYPE_CHECKING: - from transformers import AutoRoundConfig + from transformers import SignRoundConfig def get_cpu_manufacturer(): @@ -1127,7 +1127,7 @@ def get_layer_backend( def get_highest_priority_backend( - quantization_config: "AutoRoundConfig", device: str, packing_format: str + quantization_config: "SignRoundConfig", device: str, packing_format: str ) -> str | None: supported_backends = [] for key in BackendInfos.keys(): diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 2478db548..af330a422 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -492,6 +492,7 @@ def forward(self, x): class WrapperWALayer(torch.nn.Module): + def __init__(self, orig_layer, enable_torch_compile=False, device="cpu"): super(WrapperWALayer, self).__init__() self.orig_layer = orig_layer @@ -664,7 +665,13 @@ def forward(self, x, **kwargs): def wrapper_block( - block, enable_minmax_tuning, enable_norm_bias_tuning, enable_torch_compile=False, device="cpu", **kwargs + block, + enable_minmax_tuning, + enable_norm_bias_tuning, + enable_torch_compile=False, + device="cpu", + is_nv_fp=False, + **kwargs, ): """Wraps the layers in the given block with a custom Wrapper module. 
@@ -711,6 +718,11 @@ def wrapper_block( set_module(block, n, new_m) else: logger.warning_once(f"{m.__class__.__name__} is not supported") + if is_nv_fp: + from auto_round.data_type.utils import update_fused_layer_global_scales + + for module in block.modules(): + update_fused_layer_global_scales(module) return quantized_layers, unquantized_layers From fdc92c2fb95532a4378adb81e56d2130402dc094 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 2 Apr 2026 09:47:50 +0800 Subject: [PATCH 29/90] update Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 57 ++++++++++++- .../quantization/sign_round/adam.py | 6 +- .../quantization/sign_round/quantizer.py | 79 +------------------ 3 files changed, 58 insertions(+), 84 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 99a156ac4..2eb90072c 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -321,12 +321,12 @@ def _get_block_outputs( or self.enable_alg_ext # Use imatrix # or not self.disable_opt_rtn # Use imatrix ): - self.block_forward = block_forward + _bf = block_forward else: # TODO FIXME # This function could not be compiled, causing a large accuracy drop when `enable_alg_ext` is used. # To avoid issues, remove it in all scenarios except WOQ. - self.block_forward = ( + _bf = ( compile_func(block_forward, self.compress_context.device) if self.compress_context.enable_torch_compile else block_forward @@ -345,7 +345,7 @@ def _get_block_outputs( self.batch_dim, share_cache_keys=self.model_context.shared_cache_keys, ) - tmp_output = self.block_forward( + tmp_output = _bf( block, tmp_input_ids, tmp_input_others, @@ -362,6 +362,57 @@ def _get_block_outputs( return output + @torch.no_grad() + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids, + input_others: dict, + indices, + device, + cache_device: str = "cpu", + ) -> torch.Tensor: + """Compute block output for a mini-batch selected by *indices* (used during training). + + Handles both LLM and diffusion model block formats. Always calls the + plain (non-compiled) ``block_forward`` because this runs inside the + autograd training loop where compilation is not needed. 
+ """ + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.model_context.shared_cache_keys, + ) + if getattr(self.model_context, "is_diffusion", False): + output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") + if isinstance(current_input_ids, dict): + hidden_states = current_input_ids.pop("hidden_states") + current_input_others.update(current_input_ids) + current_input_ids = hidden_states + output_q = block_forward( + block, + current_input_ids, + current_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + device, + idx, + ) + else: + output_q = block_forward( + block, + current_input_ids, + current_input_others, + self.model_context.amp, + self.model_context.amp_dtype, + device, + ) + return output_q.to(cache_device) + @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/algorithms/quantization/sign_round/adam.py b/auto_round/algorithms/quantization/sign_round/adam.py index e81c02960..315a05c9e 100644 --- a/auto_round/algorithms/quantization/sign_round/adam.py +++ b/auto_round/algorithms/quantization/sign_round/adam.py @@ -22,9 +22,9 @@ class SignRoundAdamQuantizer(SignRoundQuantizer): - def _get_extra_optimizer_kwargs(self) -> dict: - """AdamW handles momentum internally; no extra kwargs needed.""" - return {} + def __init__(self, config): + super().__init__(config) + self.momentum = None # AdamW handles momentum internally def _get_optimizer(self, optimizer): if optimizer is None: diff --git a/auto_round/algorithms/quantization/sign_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py index 339c8732c..09b1d7e1b 100644 --- a/auto_round/algorithms/quantization/sign_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -41,7 +41,6 @@ is_auto_device_mapping, is_hpex_available, memory_monitor, - merge_block_output_keys, mv_module_from_gpu, set_amax_for_all_moe_layers, to_device, @@ -53,18 +52,8 @@ from auto_round.utils.distributed import setup_ddp_if_needed_ from auto_round.wrapper import WrapperLinear, unwrapper_block, unwrapper_layer, wrapper_block -DIFFUSION_OUTPUT_CONFIGS = { - "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], - "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], -} - class SignRoundQuantizer(BaseQuantizers): - # Override the base empty dict with Flux-specific output key mappings. - DIFFUSION_OUTPUT_CONFIGS = { - "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], - "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], - } def __init__(self, config: SignRoundConfig): super().__init__(config) @@ -89,14 +78,6 @@ def __init__(self, config: SignRoundConfig): self.optimizer = self._get_optimizer(optimizer=config.optimizer) self.wrapper_block = wrapper_block - def _get_extra_optimizer_kwargs(self) -> dict: - """Return extra keyword arguments passed to the optimizer constructor. - - SignSGD requires ``momentum``; AdamW-based subclasses override this to - return ``{}`` because AdamW handles its own momentum internally. 
- """ - return {"momentum": self.momentum} - def post_init(self): super().post_init() if self.enable_alg_ext: @@ -118,64 +99,6 @@ def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> current_output = torch.cat(current_output, dim=self.batch_dim) return current_output - def _get_diffusion_current_q_output( - self, - block: torch.nn.Module, - input_ids: dict, - input_others: dict, - indices: list[int], - device: str, - cache_device: str = "cpu", - ): - output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) - idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") - current_input_ids, current_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - seqlen=self.seqlen, - batch_dim=self.batch_dim, - share_cache_keys=self.model_context.shared_cache_keys, - ) - if isinstance(current_input_ids, dict): - hidden_states = current_input_ids.pop("hidden_states") - merge_block_output_keys(block, current_input_others, current_input_ids) - current_input_ids = hidden_states - output_q = block_forward( - block, - current_input_ids, - current_input_others, - self.model_context.amp, - self.model_context.amp_dtype, - device, - idx, - ) - return output_q.to(cache_device) - - def _get_current_q_output( - self, - block: torch.nn.Module, - input_ids: list[torch.Tensor], - input_others: dict, - indices: list[int], - device: str, - cache_device: str = "cpu", - ) -> torch.Tensor: - if self.model_context.is_diffusion: - return self._get_diffusion_current_q_output(block, input_ids, input_others, indices, device, cache_device) - current_input_ids, current_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - seqlen=self.seqlen, - batch_dim=self.batch_dim, - share_cache_keys=self.model_context.shared_cache_keys, - ) - output_q = self.block_forward( - block, current_input_ids, current_input_others, self.model_context.amp, self.model_context.amp_dtype, device - ) - return output_q.to(cache_device) - def _get_current_num_elm( self, input_ids: list[torch.Tensor], @@ -282,7 +205,7 @@ def quantize_block( lr = torch.tensor(self.lr) minmax_lr = torch.tensor(self.minmax_lr) - extra_kwargs = self._get_extra_optimizer_kwargs() + extra_kwargs = {} if self.momentum is None else {"momentum": self.momentum} if self.enable_minmax_tuning: params = [ From fb046131771a9789a0314671545416bd27fa5d79 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 2 Apr 2026 10:10:07 +0800 Subject: [PATCH 30/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 12 ------------ auto_round/algorithms/quantization/rtn/quantizer.py | 1 - .../algorithms/quantization/sign_round/quantizer.py | 3 +-- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 2eb90072c..aa3ba39d4 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -98,18 +98,6 @@ def amp_dtype(self): return getattr(self.model_context, "amp_dtype", torch.float32) - def resolve_scheme(self, *args, **kwargs) -> None: - raise NotImplementedError( - "resolve_scheme() has been moved to BaseCompressor in compressors_new/base.py. " - "Call BaseCompressor.post_init() instead." - ) - - def post_init(self, *args, **kwargs) -> None: - raise NotImplementedError( - "post_init() has been moved to BaseCompressor/_scheme_post_init() in " - "compressors_new/base.py. 
Call BaseCompressor.post_init() instead." - ) - def _register_act_max_hook(self, model): def get_act_max_hook(module, input, output): diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 607cc9c4d..94dea2dfe 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import traceback from collections import defaultdict from typing import Any, Callable, Optional, Union diff --git a/auto_round/algorithms/quantization/sign_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py index 09b1d7e1b..767849d3f 100644 --- a/auto_round/algorithms/quantization/sign_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -31,6 +31,7 @@ collect_best_params, immediate_pack, ) +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.logger import logger from auto_round.utils import ( check_to_quantized, @@ -78,8 +79,6 @@ def __init__(self, config: SignRoundConfig): self.optimizer = self._get_optimizer(optimizer=config.optimizer) self.wrapper_block = wrapper_block - def post_init(self): - super().post_init() if self.enable_alg_ext: try: logger.info("using algorithm extension for quantization.") From 458827985a4588a328d90d062285f221aa38628f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 2 Apr 2026 13:26:22 +0800 Subject: [PATCH 31/90] fix by comment Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/__init__.py | 2 +- .../algorithms/quantization/adam_round/__init__.py | 13 +++++++++++++ .../quantization/{sign_round => adam_round}/adam.py | 2 +- auto_round/algorithms/quantization/base.py | 10 +++++++++- .../algorithms/quantization/sign_round/config.py | 2 +- .../algorithms/quantization/sign_round/quantizer.py | 4 ++-- auto_round/compressors_new/calib.py | 4 ++-- 7 files changed, 29 insertions(+), 8 deletions(-) create mode 100644 auto_round/algorithms/quantization/adam_round/__init__.py rename auto_round/algorithms/quantization/{sign_round => adam_round}/adam.py (97%) diff --git a/auto_round/algorithms/quantization/__init__.py b/auto_round/algorithms/quantization/__init__.py index 719528d14..6a727f31b 100644 --- a/auto_round/algorithms/quantization/__init__.py +++ b/auto_round/algorithms/quantization/__init__.py @@ -16,6 +16,6 @@ from auto_round.algorithms.quantization.config import QuantizationConfig from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig from auto_round.algorithms.quantization.sign_round.quantizer import SignRoundQuantizer -from auto_round.algorithms.quantization.sign_round.adam import SignRoundAdamQuantizer +from auto_round.algorithms.quantization.adam_round.adam import AdamRoundQuantizer from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.algorithms.quantization.rtn.quantizer import RTNQuantizer, OptimizedRTNQuantizer diff --git a/auto_round/algorithms/quantization/adam_round/__init__.py b/auto_round/algorithms/quantization/adam_round/__init__.py new file mode 100644 index 000000000..14a492441 --- /dev/null +++ b/auto_round/algorithms/quantization/adam_round/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/auto_round/algorithms/quantization/sign_round/adam.py b/auto_round/algorithms/quantization/adam_round/adam.py similarity index 97% rename from auto_round/algorithms/quantization/sign_round/adam.py rename to auto_round/algorithms/quantization/adam_round/adam.py index 315a05c9e..96835b533 100644 --- a/auto_round/algorithms/quantization/sign_round/adam.py +++ b/auto_round/algorithms/quantization/adam_round/adam.py @@ -20,7 +20,7 @@ from auto_round.utils import check_is_cpu, htcore, is_hpex_available -class SignRoundAdamQuantizer(SignRoundQuantizer): +class AdamRoundQuantizer(SignRoundQuantizer): def __init__(self, config): super().__init__(config) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index aa3ba39d4..3a0a0da59 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -278,6 +278,15 @@ def quantize_layer(self, layer_name: str, **kwargs): """ raise NotImplementedError("quantize_layer must be implemented in subclasses of BaseQuantizers") + def quantize_layer_outside_block(self, layer_name: str, **kwargs): + """Quantizes a single layer of the model outside of a block. + + Args: + layer_name (str): The name of the layer to quantize. The layer module is + retrieved internally via get_module(model, layer_name). + """ + raise NotImplementedError("quantize_layer_outside_block must be implemented in subclasses of BaseQuantizers") + @torch.no_grad() def _get_block_outputs( self, @@ -350,7 +359,6 @@ def _get_block_outputs( return output - @torch.no_grad() def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/algorithms/quantization/sign_round/config.py b/auto_round/algorithms/quantization/sign_round/config.py index 4513f5ad3..90b7d86b1 100644 --- a/auto_round/algorithms/quantization/sign_round/config.py +++ b/auto_round/algorithms/quantization/sign_round/config.py @@ -91,7 +91,7 @@ def __init__( self.enable_adam = enable_adam if self.enable_adam: - self._alg_cls = "SignRoundAdamQuantizer" + self._alg_cls = "AdamRoundQuantizer" def check_configs(self) -> None: """Checks if the configurations are valid. diff --git a/auto_round/algorithms/quantization/sign_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py index 767849d3f..0f177d970 100644 --- a/auto_round/algorithms/quantization/sign_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -127,7 +127,7 @@ def _get_loss( ): autocast_ctx = ( nullcontext() - if self.model_context.amp + if not self.model_context.amp else autocast(device_type=str(device).split(":")[0], dtype=self.model_context.amp_dtype) ) if self.attention_mask: @@ -334,7 +334,7 @@ def quantize_block( logger.infoclean(dump_info) return best_params - def quantize_layer( + def quantize_layer_outside_block( self, layer_name: str, input_ids: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu", **kwargs ): """Quantize a specific layer of the model using the provided inputs. 
diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index a272a0f6c..8b2e92b60 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1028,7 +1028,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: if not self.is_immediate_saving: self.model = mv_module_from_gpu(self.model) clear_memory(device_list=self.compress_context.device_list) - quant_layer = self.quantizer.quantize_layer + quant_layer = self.quantizer.quantize_layer_outside_block for layer_name in layer_names: layer_input = layer_inputs[layer_name] layer_input = to_device(layer_input, self.compress_context.cache_device) @@ -1253,7 +1253,7 @@ def _quantize_via_rtn_blockwise(self) -> None: dtype = None if self.super_group_size is not None: dtype = torch.float32 - self.quantizer.quantize_layer(name, dtype=dtype) + self.quantizer.quantize_layer_outside_block(name, dtype=dtype) # clear_memory(device_list=self.compress_context.device_list) # if self.is_immediate_saving: # shard_writer(self, is_finalize=True) From a313c26493055128ba007c957aa7b13549f0760a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 2 Apr 2026 14:50:58 +0800 Subject: [PATCH 32/90] fix output_dir Signed-off-by: n1ck-guo --- auto_round/compressors_new/shard_writer.py | 33 ++++++++++++------- auto_round/inference/backend.py | 4 +-- docs/step_by_step.md | 8 ++--- docs/step_by_step_CN.md | 8 ++--- test/test_ark/test_model.py | 8 +++-- test/test_cpu/core/test_autoround.py | 15 +++++---- test/test_cuda/quantization/test_asym.py | 18 +++++----- test/test_cuda/quantization/test_mxfp_nvfp.py | 12 ++++--- test/test_xpu/test_autoround.py | 13 ++++---- 9 files changed, 68 insertions(+), 51 deletions(-) diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index 4ea5ce0ca..68443ac8f 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -88,17 +88,24 @@ def __init__( self.total_param_size_bytes = 0 self.skipped_meta_tensors = [] - # Directory Setup + ShardWriter._initialized = True + + @property + def output_dir(self) -> str: + """Derive the output directory from the current CompressContext at access time. + + Reading from context rather than caching the path at construction time ensures + the ShardWriter always uses the final export directory even if + ``CompressContext.output_dir`` is updated after the ShardWriter was created + (e.g. by ``_get_export_dir()`` in ``quantize_and_save()``). 
+ """ compress_context = CompressContext.get_context() formats = compress_context.formats base_dir = _get_save_folder_name(formats[0]) subfolder = getattr(self.model, "_autoround_pipeline_subfolder", None) if subfolder: base_dir = os.path.join(base_dir, subfolder) - self.output_dir = os.path.join(base_dir, "") - os.makedirs(self.output_dir, exist_ok=True) - - ShardWriter._initialized = True + return os.path.join(base_dir, "") @classmethod def reset(cls): @@ -179,8 +186,10 @@ def _flush_shard(self): return self.shard_counter += 1 + output_dir = self.output_dir + os.makedirs(output_dir, exist_ok=True) tmp_name = f"model-shard-{self.shard_counter:05d}.{self.shard_suffix}" - tmp_path = os.path.join(self.output_dir, tmp_name) + tmp_path = os.path.join(output_dir, tmp_name) if self.use_safetensors: from safetensors.torch import save_file @@ -190,7 +199,7 @@ def _flush_shard(self): torch.save(self.current_shard_tensors, tmp_path) saved_params = list(self.current_shard_tensors.keys()) - self.shard_meta.append({"tmp_file": tmp_name, "params": saved_params}) + self.shard_meta.append({"tmp_file": tmp_name, "params": saved_params, "dir": output_dir}) self._all_saved.update(saved_params) # Offload logic: move modules to meta device once all params are saved @@ -245,21 +254,23 @@ def finalize(self): logger.warning("No tensors saved.") return + output_dir = self.output_dir for idx, meta in enumerate(self.shard_meta, start=1): - old_path = os.path.join(self.output_dir, meta["tmp_file"]) + shard_dir = meta.get("dir", output_dir) + old_path = os.path.join(shard_dir, meta["tmp_file"]) new_name = ( f"model.{self.shard_suffix}" if self.shard_counter == 1 else f"model-{idx:05d}-of-{self.shard_counter:05d}.{self.shard_suffix}" ) - - os.rename(old_path, os.path.join(self.output_dir, new_name)) + new_path = os.path.join(shard_dir, new_name) + os.rename(old_path, new_path) for p in meta["params"]: self.global_weight_map[p] = new_name # 3. Write Index JSON index_ext = "safetensors.index.json" if self.use_safetensors else "bin.index.json" - index_path = os.path.join(self.output_dir, f"model.{index_ext}") + index_path = os.path.join(output_dir, f"model.{index_ext}") index_data = { "metadata": { diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 5596642bb..d98545679 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -30,7 +30,7 @@ import cpuinfo if TYPE_CHECKING: - from transformers import SignRoundConfig + from transformers import AutoRoundConfig def get_cpu_manufacturer(): @@ -1127,7 +1127,7 @@ def get_layer_backend( def get_highest_priority_backend( - quantization_config: "SignRoundConfig", device: str, packing_format: str + quantization_config: "AutoRoundConfig", device: str, packing_format: str ) -> str | None: supported_backends = [] for key in BackendInfos.keys(): diff --git a/docs/step_by_step.md b/docs/step_by_step.md index a076e9acb..54a6ffb31 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -759,10 +759,10 @@ The backend may not always be the most suitable for certain devices. You can specify your preferred backend such as "ark" for CPU and Intel GPU, "marlin/exllamav2/triton" for CUDA, according to your needs or hardware compatibility. Please note that additional corresponding libraries may be required. 
```python -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -quantization_config = AutoRoundConfig(backend="ark") +quantization_config = SignRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) @@ -797,10 +797,10 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal Most GPTQ/AWQ models can be converted to the AutoRound format for better compatibility and support with Intel devices. Please note that the quantization config will be changed if the model is serialized. ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" -quantization_config = AutoRoundConfig() +quantization_config = SignRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) diff --git a/docs/step_by_step_CN.md b/docs/step_by_step_CN.md index b7cd57f64..d764363c1 100644 --- a/docs/step_by_step_CN.md +++ b/docs/step_by_step_CN.md @@ -714,10 +714,10 @@ AutoRound 会根据兼容性为每个层自动选择推理后端,默认优先 指定后端的示例: ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -quantization_config = AutoRoundConfig(backend="ark") +quantization_config = SignRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) @@ -754,10 +754,10 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal 转换并推理的示例: ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" -quantization_config = AutoRoundConfig() +quantization_config = SignRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index ca0d8b4b6..5500999fb 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -44,14 +44,16 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t else: autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model + _, saved_folders = autoround.quantize_and_save( + output_dir=quantized_model_path, format=format + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, dtype=dtype, device_map=device, quantization_config=quantization_config + saved_folders[0], dtype=dtype, device_map=device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) model_infer(model, tokenizer) evaluate_accuracy(model, 
tokenizer, threshold=tar_acc, batch_size=32, limit=limit) torch.xpu.empty_cache() diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index dacd7bf4a..1bb8af079 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -20,6 +20,7 @@ class TestAutoRound: + @classmethod def setup_class(self): model_name = opt_name_or_path @@ -372,14 +373,14 @@ def test_rtn(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + saved_folders[0], torch_dtype=torch.float16, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) model_infer(model, tokenizer) def test_embed_quant(self, tiny_opt_model_path, dataloader): @@ -636,8 +637,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path): nsamples=1, disable_opt_rtn=True, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, saved_folders = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(saved_folders[0], device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -653,8 +654,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, saved_folders = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(saved_folders[0], device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py index 5145cf13d..eaca19d3f 100644 --- a/test/test_cuda/quantization/test_asym.py +++ b/test/test_cuda/quantization/test_asym.py @@ -29,15 +29,15 @@ def setup_and_teardown_class(self): def test_asym_group_size_with_tuning(self, group_size, tiny_opt_model_path): bits, sym = 4, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, saved_folders = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained( - self.save_dir, + saved_folders[0], torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) model_infer(model, tokenizer) @pytest.mark.skip_ci(reason="Not necessary since it's covered by backend tests") # skip this test in CI @@ -45,15 +45,15 @@ def test_asym_group_size_with_tuning(self, group_size, tiny_opt_model_path): def test_asym_bits_with_tuning(self, bits, 
tiny_opt_model_path): group_size, sym = 128, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, saved_folders = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained( - self.save_dir, + saved_folders[0], torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) model_infer(model, tokenizer) @pytest.mark.skip_ci(reason="Not necessary since it's covered by backend tests") # skip this test in CI @@ -61,17 +61,17 @@ def test_asym_bits_with_tuning(self, bits, tiny_opt_model_path): def test_asym_format_with_tuning(self, format, tiny_opt_model_path): bits, group_size, sym = 4, 128, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format=format, output_dir=self.save_dir) + _, saved_folders = ar.quantize_and_save(format=format, output_dir=self.save_dir) if format == "auto_round:auto_gptq": # Cannot load correctly, skip auto_gptq since it's deprecated. return model = AutoModelForCausalLM.from_pretrained( - self.save_dir, + saved_folders[0], torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) model_infer(model, tokenizer) diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py index 3e40e7288..0befbe6f6 100644 --- a/test/test_cuda/quantization/test_mxfp_nvfp.py +++ b/test/test_cuda/quantization/test_mxfp_nvfp.py @@ -55,11 +55,11 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, saved_folders = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, + saved_folders[0], torch_dtype="auto", ) model.eval() @@ -144,9 +144,11 @@ def test_qwen_moe_quant_infer(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + _, saved_folders = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) + model = AutoModelForCausalLM.from_pretrained(saved_folders[0], torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) from ...helpers import evaluate_accuracy evaluate_accuracy(model, tokenizer, threshold=0.49, batch_size=16, task="piqa", limit=10) diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 4c4c63678..21ec59552 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -12,6 +12,7 @@ class TestAutoRoundXPU: + @classmethod def setup_class(self): self.device = "xpu" @@ -47,13 +48,13 @@ def test_gptq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = 
self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path) + _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map=self.device, quantization_config=quantization_config + saved_folders[0], device_map=self.device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) @@ -78,13 +79,13 @@ def test_awq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map=self.device, quantization_config=quantization_config + saved_folders[0], device_map=self.device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) From 19f95eda7e53c83ab2d8b92fb1ce6fbda0f83c29 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 2 Apr 2026 16:51:08 +0800 Subject: [PATCH 33/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 2 + auto_round/compressors_new/calib.py | 11 ++++- test/__init__.pyc | Bin 0 -> 141 bytes test/test_ark/test_model.py | 6 +-- test/test_cpu/backends/test_torch_backend.py | 16 ++++--- test/test_cpu/core/test_autoround.py | 30 +++++++----- test/test_cpu/export/test_export.py | 18 ++++--- test/test_cpu/export/test_gguf_format.py | 18 ++++--- test/test_cpu/export/test_llmc_format.py | 8 ++-- test/test_cpu/models/test_mllm.py | 8 ++-- test/test_cpu/models/test_moe_model.py | 4 +- .../quantization/test_act_quantization.py | 8 ++-- test/test_cpu/quantization/test_asym.py | 18 +++---- test/test_cpu/quantization/test_mix_bits.py | 12 +++-- test/test_cpu/quantization/test_mxfp_nvfp.py | 27 +++++++---- .../quantization/test_mxfp_save_load.py | 2 +- test/test_cpu/quantization/test_new_arch.py | 38 +++++++++++++++ test/test_cpu/schemes/test_scheme.py | 24 +++++----- test/test_cpu/utils/test_generation.py | 6 ++- test/test_cuda/algorithms/test_alg_ext.py | 4 +- .../backends/test_exllamav2_backend.py | 40 +++++++++------- .../test_cuda/backends/test_marlin_backend.py | 33 +++++++------ test/test_cuda/backends/test_torch_backend.py | 24 ++++++---- .../test_cuda/backends/test_triton_backend.py | 44 +++++++++--------- test/test_cuda/export/test_auto_awq_format.py | 4 +- .../test_cuda/export/test_auto_gptq_format.py | 2 +- .../export/test_auto_round_format.py | 28 +++++++---- test/test_cuda/export/test_gguf_format.py | 20 ++++---- test/test_cuda/integrations/test_sglang.py | 14 +++--- test/test_cuda/integrations/test_vllm.py | 6 +-- test/test_cuda/models/test_fp8_model.py | 32 ++++++------- test/test_cuda/models/test_moe_model.py | 4 +- test/test_cuda/quantization/test_asym.py | 18 
+++---- test/test_cuda/quantization/test_mxfp_nvfp.py | 10 ++-- .../quantization/test_torch_compile.py | 4 +- .../transform/test_mxfp4_transform.py | 18 +++---- test/test_xpu/test_autoround.py | 26 +++++++---- 37 files changed, 359 insertions(+), 228 deletions(-) create mode 100755 test/__init__.pyc create mode 100644 test/test_cpu/quantization/test_new_arch.py diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 1d52ee509..929b7fe7e 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -907,6 +907,8 @@ def save_quantized( folders.append(save_folder) if return_folders: + if len(folders) == 1: + folders = folders[0] return compressed_model, folders else: return compressed_model diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 8b2e92b60..08791f303 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -777,7 +777,14 @@ def _quantize_blocks( mv_module_from_gpu(m) if self.enable_torch_compile: torch._dynamo.reset() - clear_memory(input_ids if q_input is None else None, device_list=self.compress_context.device_list) + # Always advance input_ids to the current block's output so that the next + # block receives the correct activations. When enable_quanted_input is + # False we reuse reference_output (unquantized block output); otherwise + # q_input already holds the quantized-block output. + next_input_ids = q_input if q_input is not None else reference_output + clear_memory( + input_ids if input_ids is not next_input_ids else None, device_list=self.compress_context.device_list + ) memory_monitor.log_summary() # ── Infrastructure: immediate_pack / shard write ────────────────── @@ -788,7 +795,7 @@ def _quantize_blocks( _immediate_pack(_mod.global_name, self.quantizer.layer_config) - input_ids = q_input if q_input is not None else input_ids + input_ids = next_input_ids if self.is_immediate_saving: self.shard_writer.write(m, is_finalize=False) diff --git a/test/__init__.pyc b/test/__init__.pyc new file mode 100755 index 0000000000000000000000000000000000000000..56ea0061b2641c037f4f73888d6b74e44a3ec687 GIT binary patch literal 141 zcmZSn%*&;~H#sAj0SXv_v;z{p`6;D2 MsdgX>N`ROF0LsB0EdT%j literal 0 HcmV?d00001 diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 5500999fb..476dc7ec9 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -44,16 +44,16 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t else: autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - _, saved_folders = autoround.quantize_and_save( + _, saved_folder = autoround.quantize_and_save( output_dir=quantized_model_path, format=format ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], dtype=dtype, device_map=device, quantization_config=quantization_config + saved_folder, dtype=dtype, device_map=device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(saved_folder) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=tar_acc, batch_size=32, limit=limit) torch.xpu.empty_cache() diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index 14951f2e6..7e45c69c4 100644 --- 
a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -41,23 +41,25 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.35, batch_size=16, limit=10) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.35, batch_size=16, limit=10) torch.cuda.empty_cache() @@ -77,14 +79,16 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.28, batch_size=32, limit=1000) torch.cuda.empty_cache() diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index 1bb8af079..8ac9ca53a 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -373,14 +373,14 @@ def test_rtn(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder - _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], + quantized_model_path, torch_dtype=torch.float16, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) def test_embed_quant(self, tiny_opt_model_path, dataloader): @@ -427,7 +427,9 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + _, 
quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round", inplace=True + ) model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -458,7 +460,9 @@ def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_awq", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_awq", inplace=True + ) quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( @@ -493,7 +497,9 @@ def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_gptq", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_gptq", inplace=True + ) quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( @@ -528,7 +534,9 @@ def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round", inplace=True + ) quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( @@ -637,8 +645,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path): nsamples=1, disable_opt_rtn=True, ) - _, saved_folders = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(saved_folders[0], device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -654,8 +662,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - _, saved_folders = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(saved_folders[0], device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index 34207e33e..3df20dcdd 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -55,7 +55,7 @@ def test_autogptq_format(self, dataloader): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") if group_size == -1: continue @@ -82,7 +82,7 @@ def test_autoround_format(self, dataloader): dataset=dataloader, ) 
quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") if group_size == -1: continue @@ -107,7 +107,9 @@ def test_autoround_awq_format(self, dataloader): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) # quantization_config = AutoRoundConfig( # backend="cpu" @@ -216,7 +218,7 @@ def test_static_afp8_export(self, static_kv_dtype): static_kv_dtype=static_kv_dtype, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -276,7 +278,7 @@ def test_static_afp8_export(self, static_kv_dtype): act_group_size=0, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() @@ -301,7 +303,7 @@ def test_static_fp8_attn(self): static_attention_dtype="fp8", ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -457,7 +459,9 @@ def test_autoawq_qwen3_vl_infer(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_awq" + ) # Check items of modules_to_not_convert in quantization config quantization_config_path = f"{quantized_model_path}/quantization_config.json" diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index 945c28db0..cc7a9e4d7 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -46,7 +46,9 @@ def test_q4_0(self, tiny_qwen_model_path): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q4_0" + ) gguf_file = os.listdir(quantized_model_path)[0] assert gguf_file.endswith(".gguf"), "Saved file is not in gguf format" # Accuracy test is covered in test_cuda/export/test_gguf_format.py::TestAutoRound::test_q4_0_accuracy @@ -59,7 +61,9 @@ def 
test_func(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q*_1" + ) assert autoround.group_size == 32 assert not autoround.sym gguf_file = os.listdir(self.save_dir)[0] @@ -91,7 +95,9 @@ def test_q4_k_m(self, dataloader, tiny_qwen_model_path): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) assert autoround.layer_config["model.layers.1.self_attn.v_proj"]["super_group_size"] == 16 assert autoround.layer_config["model.layers.1.self_attn.v_proj"]["data_type"] == "int_sym_dq" assert autoround.layer_config["model.layers.0.self_attn.v_proj"]["data_type"] == "int_asym_dq" @@ -144,7 +150,7 @@ def test_vlm_gguf(self, tiny_qwen_vl_model_path): quant_nontext_module=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") assert "mmproj-model.gguf" in os.listdir(self.save_dir) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 @@ -164,7 +170,7 @@ def test_vlm_gguf_wo_quant_nontext_module(self, tiny_qwen_vl_model_path): quant_nontext_module=False, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") assert "mmproj-model.gguf" in os.listdir(self.save_dir) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 @@ -254,7 +260,7 @@ def test_q2k_mixed(self, tiny_qwen_moe_model_path): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert file_size < 1150, f"file size {file_size} MB is too large for q2_k_mixed format" diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py index ebc1bd87c..212dfb6f6 100644 --- a/test/test_cpu/export/test_llmc_format.py +++ b/test/test_cpu/export/test_llmc_format.py @@ -55,7 +55,7 @@ def test_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save(self.save_dir, format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="llm_compressor") # from vllm import LLM # model = LLM(self.save_dir) # result = model.generate("Hello my name is") @@ -63,7 +63,7 @@ def test_llmcompressor_fp8(self): import json - config = json.load(open(os.path.join(self.save_dir, "config.json"))) + config = json.load(open(os.path.join(quantized_model_path, "config.json"))) assert "group_0" in config["quantization_config"]["config_groups"] assert 
config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" @@ -80,11 +80,11 @@ def test_autoround_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save(self.save_dir, format="auto_round:llm_compressor") + _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round:llm_compressor") import json - config = json.load(open(os.path.join(self.save_dir, "config.json"))) + config = json.load(open(os.path.join(quantized_model_path, "config.json"))) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index 9fd6c69df..d75be5667 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -225,15 +225,17 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): processor=processor, image_processor=image_processor, ) - autoround.quantize_and_save(self.save_dir, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round") import requests from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained(self.save_dir, torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + quantized_model_path, torch_dtype="auto", device_map="auto" + ) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - processor = AutoProcessor.from_pretrained(self.save_dir) + processor = AutoProcessor.from_pretrained(quantized_model_path) messages = [ { "role": "user", diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index ff7a9e732..f597a4f9d 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -106,9 +106,9 @@ def test_qwen3_vl_moe_mxfp(tiny_qwen3_vl_moe_model_path): disable_opt_rtn=True, ignore_layers="self_attn,lm_head, mlp.gate", ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." 
- loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu") + loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(quantized_model_path, device_map="cpu") for n, m in quantized_model.named_modules(): if m.__class__.__name__ == "QuantLinear": diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py index 4798e0fe4..6546efd62 100644 --- a/test/test_cpu/quantization/test_act_quantization.py +++ b/test/test_cpu/quantization/test_act_quantization.py @@ -108,7 +108,7 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" not in model.config.quantization_config.extra_config @@ -129,7 +129,7 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"] assert "act_bits" in kproj_config.keys() and kproj_config["act_bits"] == 16 @@ -148,7 +148,7 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") extra_config = model.config.quantization_config.extra_config @@ -178,7 +178,7 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") from transformers import AutoConfig extra_config = AutoConfig.from_pretrained(quantized_model_path).quantization_config["extra_config"] diff --git a/test/test_cpu/quantization/test_asym.py b/test/test_cpu/quantization/test_asym.py index 05f9c1990..0f792a3a1 100644 --- a/test/test_cpu/quantization/test_asym.py +++ b/test/test_cpu/quantization/test_asym.py @@ -28,15 +28,15 @@ def test_asym_group_size(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) def 
test_asym_bits(self, tiny_opt_model_path): @@ -45,15 +45,15 @@ def test_asym_bits(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) # use parameters later @@ -70,13 +70,13 @@ def test_asym_format(self, tiny_opt_model_path): nsamples=1, disable_opt_rtn=True, ) - ar.quantize_and_save(format=format, output_dir=self.save_folder) + _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_folder) model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py index 71a0dc1cd..1e45d1b73 100644 --- a/test/test_cpu/quantization/test_mix_bits.py +++ b/test/test_cpu/quantization/test_mix_bits.py @@ -59,7 +59,7 @@ def test_mixed_gptqmodel(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") # test original GPTQModel inference from gptqmodel import GPTQModel @@ -87,7 +87,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -114,7 +114,7 @@ def test_mixed_autoround_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") config_file = Path(quantized_model_path) / "config.json" with open(config_file, "r", encoding="utf-8") as f: config = json.load(f) @@ -147,7 +147,7 @@ def test_fallback_regex_for_awq_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -219,7 +219,9 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, 
inplace=False, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index fb8d51c84..acd411954 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -78,7 +78,9 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): layer_config=layer_config, trust_remote_code=False, ) - compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=self.save_dir, inplace=True, format="auto_round" + ) lm_head = compressed_model.lm_head assert ( hasattr(lm_head, "weight_scale") @@ -87,7 +89,6 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): and lm_head.weight_packed.dtype is torch.uint8 and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" - quantized_model_path = self.save_dir assert is_model_outputs_similar(model_name, quantized_model_path) def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader): @@ -209,7 +210,9 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -245,7 +248,9 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -281,7 +286,9 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -306,7 +313,9 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round" + ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -335,7 +344,9 @@ def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): layer_config=layer_config, ) 
quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=True, format="auto_round" + ) assert is_model_outputs_similar(model_name, quantized_model_path) @pytest.mark.parametrize( @@ -372,7 +383,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tiny ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save( + compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round", ) diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 5e12edc68..1f9cb645c 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -60,7 +60,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/quantization/test_new_arch.py b/test/test_cpu/quantization/test_new_arch.py new file mode 100644 index 000000000..dd24e7e9d --- /dev/null +++ b/test/test_cpu/quantization/test_new_arch.py @@ -0,0 +1,38 @@ +import copy +import shutil +import sys + +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound + +from ...helpers import get_model_path + + +class TestAutoRound: + # def test_calib(self, tiny_opt_model_path): + # from auto_round.compressors_new import Compressor + # from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig + # config = SignRoundConfig(scheme="W4A16", iters=200, lr=0.005, bits=2, group_size=32) + # compressor = Compressor(config, tiny_opt_model_path, format="auto_round") + # compressor.quantize_and_save() + + def test_opt_rtn(self, tiny_opt_model_path): + from auto_round.algorithms.quantization.rtn.config import RTNConfig + from auto_round.compressors_new import Compressor + + config = RTNConfig(scheme="W4A16", bits=2, group_size=32) + compressor = Compressor(config, tiny_opt_model_path, format="auto_round") + compressor.quantize_and_save() + + ar = AutoRound(tiny_opt_model_path, bits=2, group_size=32, iters=0) + ar.quantize_and_save() + + # def test_rtn(self, tiny_opt_model_path): + # from auto_round.compressors_new import Compressor + # from auto_round.algorithms.quantization.rtn.config import RTNConfig + # config = RTNConfig(scheme="W4A16", bits=2, group_size=32, disable_opt_rtn=True) + # compressor = Compressor(config, tiny_opt_model_path, format="auto_round") + # compressor.quantize_and_save() diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index 8af28b7a5..3c4ecc713 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -56,13 +56,13 @@ def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): low_cpu_mem_usage=False, layer_config=layer_config, ) - ar.quantize_and_save(self.save_folder) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) assert 
ar.bits == 4 assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 # assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 # gate has been added to ignore_layers - model = transformers.AutoModelForCausalLM.from_pretrained(self.save_folder, trust_remote_code=True) + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path, trust_remote_code=True) assert model is not None, "Model loading failed after quantization with W4A16_MIXED scheme on MoE" def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): @@ -78,8 +78,8 @@ def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): dataset=dataloader, low_cpu_mem_usage=False, ) - ar.quantize_and_save(self.save_folder) - model = transformers.Qwen2_5_VLForConditionalGeneration.from_pretrained(self.save_folder) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) + model = transformers.Qwen2_5_VLForConditionalGeneration.from_pretrained(quantized_model_path) assert model is not None, "Model loading failed after quantization with W4A16_MIXED scheme on MLLM" assert ar.bits == 4 assert ar.model.model.language_model.layers[0].self_attn.q_proj.bits == 16 @@ -98,8 +98,8 @@ def test_mxfp4_rceil(self, tiny_opt_model_path): assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" - ar.quantize_and_save() - model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save() + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path, trust_remote_code=True) assert model is not None, "Model loading failed after quantization with MXFP4 scheme" def test_vlm(self, tiny_qwen_vl_model_path): @@ -115,8 +115,8 @@ def test_nvfp4(self, tiny_opt_model_path, dataloader): assert ar.act_bits == 4 assert ar.data_type == "nv_fp" assert ar.act_data_type == "nv_fp4_with_static_gs" - ar.quantize_and_save(self.save_folder) - model = transformers.AutoModelForCausalLM.from_pretrained(self.save_folder, trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path, trust_remote_code=True) assert model is not None, "Model loading failed after quantization with NVFP4 scheme" @pytest.mark.parametrize( @@ -204,8 +204,8 @@ def test_fp8_static(self, tiny_opt_model_path): assert ar.act_data_type == "fp" assert ar.group_size == -1 assert ar.act_dynamic is False - ar.quantize_and_save() - model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save() + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path, trust_remote_code=True) assert model is not None, "Model loading failed after quantization with FP8_STATIC scheme" def test_fp8_static_rtn(self, tiny_opt_model_path): @@ -216,6 +216,6 @@ def test_fp8_static_rtn(self, tiny_opt_model_path): assert ar.act_data_type == "fp" assert ar.group_size == -1 assert ar.act_dynamic is False - ar.quantize_and_save(self.save_folder) - model = transformers.AutoModelForCausalLM.from_pretrained(self.save_folder, trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path, 
trust_remote_code=True) assert model is not None, "Model loading failed after quantization with FP8_STATIC scheme" diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py index 448c3cc40..2acb560a2 100644 --- a/test/test_cpu/utils/test_generation.py +++ b/test/test_cpu/utils/test_generation.py @@ -43,7 +43,9 @@ def test_4bits_sym(self, dataloader): ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False) + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round", inplace=False + ) model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -78,7 +80,7 @@ def test_autoround_sym(self, dataloader): ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True diff --git a/test/test_cuda/algorithms/test_alg_ext.py b/test/test_cuda/algorithms/test_alg_ext.py index 17cbe6d90..7aceb84fb 100644 --- a/test/test_cuda/algorithms/test_alg_ext.py +++ b/test/test_cuda/algorithms/test_alg_ext.py @@ -48,9 +48,9 @@ def test_all_support_dtype(self, scheme, tiny_qwen_model_path): def test_2bits(self): model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) - ar.quantize_and_save(self.save_folder) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, device_map="auto", ) diff --git a/test/test_cuda/backends/test_exllamav2_backend.py b/test/test_cuda/backends/test_exllamav2_backend.py index 4ed812105..8b9027b31 100644 --- a/test/test_cuda/backends/test_exllamav2_backend.py +++ b/test/test_cuda/backends/test_exllamav2_backend.py @@ -41,23 +41,25 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.35, batch_size=16) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 
model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.35, batch_size=16) torch.cuda.empty_cache() @@ -83,10 +85,10 @@ def test_gptq_exllamav2_4bits_sym(self, dataloader): quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.27, batch_size=16) torch.cuda.empty_cache() @@ -108,14 +110,16 @@ def test_gptq_exllamav2_4bits_sym_group_size(self, group_size): sym=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.15, batch_size=64) torch.cuda.empty_cache() @@ -134,15 +138,17 @@ def test_gptqmodel_awq_exllamav2_4bits_asym(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:awq_exllamav2") # test awq bfloat16 inference model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) assert model.dtype == torch.bfloat16, f"Expected model dtype bfloat16, got {model.dtype}" - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # Inference generation check eval_generated_prompt(model, tokenizer) # Accuracy check @@ -165,14 +171,16 @@ def test_gptqmodel_awq_exllamav2_4bits_sym(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:awq_exllamav2") model = AutoModelForCausalLM.from_pretrained( # test awq bfloat16 inference - self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = 
AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # Inference generation check eval_generated_prompt(model, tokenizer) # Accuracy check diff --git a/test/test_cuda/backends/test_marlin_backend.py b/test/test_cuda/backends/test_marlin_backend.py index ff00315d4..e1115ce6a 100644 --- a/test/test_cuda/backends/test_marlin_backend.py +++ b/test/test_cuda/backends/test_marlin_backend.py @@ -49,14 +49,14 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.27, batch_size=16) torch.cuda.empty_cache() @@ -80,7 +80,7 @@ def test_marlin_group_size(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( @@ -107,14 +107,17 @@ def test_marlin_group_size(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=quantization_config, ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.14, batch_size=16) @@ -179,14 +182,16 @@ def test_gptqmodel_awq_marlin_4bits_sym(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:awq_marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config + quantized_model_path, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # Inference generation check eval_generated_prompt(model, tokenizer) # Accuracy check @@ -211,14 +216,16 @@ def 
test_gptqmodel_awq_marlin_group_size(self, group_size): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:awq_marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config + quantized_model_path, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # Inference generation check eval_generated_prompt(model, tokenizer) # Accuracy check diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py index dfd2c85eb..fafa5ccff 100644 --- a/test/test_cuda/backends/test_torch_backend.py +++ b/test/test_cuda/backends/test_torch_backend.py @@ -49,14 +49,16 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.35, batch_size=16) torch.cuda.empty_cache() @@ -79,13 +81,15 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.28, batch_size=16) torch.cuda.empty_cache() @@ -130,7 +134,9 @@ def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) device = "auto" ##cpu, hpu, cuda from transformers import AutoRoundConfig @@ -160,14 +166,16 @@ def test_gptqmodel_awq_torch_4bits_group_size_16(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, 
format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="gptqmodel:awq_torch") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # Inference generation check output = model_infer(model, tokenizer) assert isinstance(output, str) and len(output.strip()) > 0, "Model failed to generate non-empty output" diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py index 6675d1620..fa1c1f152 100644 --- a/test/test_cuda/backends/test_triton_backend.py +++ b/test/test_cuda/backends/test_triton_backend.py @@ -35,14 +35,14 @@ def test_tritonv2_2bits_asym(self): bits, group_size, sym = 2, 32, False autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.19, batch_size=16) torch.cuda.empty_cache() @@ -65,14 +65,16 @@ def test_tritonv2_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.34, batch_size=16) torch.cuda.empty_cache() @@ -95,23 +97,23 @@ def test_tritonv2_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) 
evaluate_accuracy(model, tokenizer, threshold=0.26, batch_size=16) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.26, batch_size=16) torch.cuda.empty_cache() @@ -125,23 +127,23 @@ def test_tritonv2_8bits_sym(self): bits, group_size, sym = 4, 256, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.27, batch_size=16) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.27, batch_size=16) torch.cuda.empty_cache() @@ -161,23 +163,23 @@ def test_tritonv2_2bits_sym(self): sym=sym, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.18, batch_size=16) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.18, batch_size=16) torch.cuda.empty_cache() diff --git a/test/test_cuda/export/test_auto_awq_format.py b/test/test_cuda/export/test_auto_awq_format.py index 
394b6ab9f..2ee2b9de4 100644 --- a/test/test_cuda/export/test_auto_awq_format.py +++ b/test/test_cuda/export/test_auto_awq_format.py @@ -70,7 +70,7 @@ def test_autoawq_format_fp_qsave_layers(self): layer_config=layer_config, ) quantized_model_path = os.path.join(self.save_dir, "test_export") - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") # test loading with AutoRoundConfig model = AutoModelForCausalLM.from_pretrained( @@ -99,7 +99,7 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = "self.save_dir" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=quantization_config diff --git a/test/test_cuda/export/test_auto_gptq_format.py b/test/test_cuda/export/test_auto_gptq_format.py index 474e058d0..f7a2891cf 100644 --- a/test/test_cuda/export/test_auto_gptq_format.py +++ b/test/test_cuda/export/test_auto_gptq_format.py @@ -72,7 +72,7 @@ def test_autogptq_format_qsave_ignore_layers(self): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py index ba9862712..863dfc9e9 100644 --- a/test/test_cuda/export/test_auto_round_format.py +++ b/test/test_cuda/export/test_auto_round_format.py @@ -53,7 +53,7 @@ def test_autoround_format(self, tiny_opt_model_path, bits, group_size, is_sym): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") # Verify loading model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda:0", trust_remote_code=True) @@ -74,7 +74,7 @@ def test_mixed_precision(self): bits, group_size, sym = 4, 128, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") eval_generated_prompt(quantized_model_path) evaluate_accuracy(quantized_model_path, threshold=0.32, batch_size=16) @@ -92,23 +92,31 @@ def test_awq_backend(self): sym=sym, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, 
device_map="cuda:0", quantization_config=quantization_config + quantized_model_path, + torch_dtype=torch.float16, + device_map="cuda:0", + quantization_config=quantization_config, ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) eval_generated_prompt(model, tokenizer) evaluate_accuracy(model, tokenizer, threshold=0.18, batch_size=16) torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.bfloat16, device_map="cuda:0", quantization_config=quantization_config + quantized_model_path, + torch_dtype=torch.bfloat16, + device_map="cuda:0", + quantization_config=quantization_config, ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) eval_generated_prompt(model, tokenizer) @pytest.mark.skip_ci(reason="Time-consuming; Accuracy evaluation") @@ -138,7 +146,7 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) from transformers import AutoRoundConfig @@ -188,7 +196,9 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cuda/export/test_gguf_format.py b/test/test_cuda/export/test_gguf_format.py index 557c9b6bc..8d4c6a363 100644 --- a/test/test_cuda/export/test_gguf_format.py +++ b/test/test_cuda/export/test_gguf_format.py @@ -132,7 +132,7 @@ def test_special_model(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") file_name = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 assert abs(file_size - 307) < 5.0 @@ -163,11 +163,11 @@ def test_vlm_gguf(self): quant_nontext_module=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - assert "mmproj-model.gguf" in os.listdir(self.save_dir) - for file in os.listdir(self.save_dir): - print(f"{file}: {os.path.getsize(os.path.join(self.save_dir, file)) / 1024**2} MB") - file_size = os.path.getsize(os.path.join(self.save_dir, file)) / 1024**2 + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) + for file in os.listdir(quantized_model_path): + print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB") + file_size = os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2 if "mmproj-model.gguf" in file: assert abs(file_size - 75) < 5.0 else: @@ -192,7 +192,7 @@ def test_q2k_mixed(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - 
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1236) < 5.0 @@ -223,7 +223,7 @@ def test_q2_k_s_ffn_down_q4k(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_s") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_s") gguf_file = os.listdir(quantized_model_path)[0] gguf_model = GGUFReader(os.path.join(quantized_model_path, gguf_file)) ffn_down_type = None @@ -252,5 +252,7 @@ def test_gguf_baseline(self): disable_opt_rtn=True, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="fake" + ) eval_generated_prompt(quantized_model_path) diff --git a/test/test_cuda/integrations/test_sglang.py b/test/test_cuda/integrations/test_sglang.py index 2bee14bd5..94d5e8c62 100644 --- a/test/test_cuda/integrations/test_sglang.py +++ b/test/test_cuda/integrations/test_sglang.py @@ -58,13 +58,13 @@ def test_ar_format_sglang(self, dataloader): dataset=dataloader, ) - autoround.quantize_and_save( + _, quantized_model_path = autoround.quantize_and_save( output_dir=self.save_dir, inplace=True, format="auto_round", ) - generated_text = self._run_sglang_inference(self.save_dir) + generated_text = self._run_sglang_inference(quantized_model_path) print(generated_text) assert "!!!" not in generated_text @@ -85,12 +85,12 @@ def test_mixed_ar_format_sglang(self, dataloader): layer_config=layer_config, ) - autoround.quantize_and_save( + _, quantized_model_path = autoround.quantize_and_save( output_dir=self.save_dir, inplace=True, format="auto_round", ) - config_file = Path(self.save_dir) / "config.json" + config_file = Path(quantized_model_path) / "config.json" with open(config_file, "r", encoding="utf-8") as f: config = json.load(f) quant_config = config.get("quantization_config", {}) @@ -100,7 +100,7 @@ def test_mixed_ar_format_sglang(self, dataloader): assert "group_size" not in extra_config[".*fc1.*"].keys() assert "bits" in extra_config[".*fc1.*"].keys() and extra_config[".*fc1.*"]["bits"] == 16 assert "bits" in extra_config[".*self_attn.*"].keys() and extra_config[".*self_attn.*"]["bits"] == 8 - generated_text = self._run_sglang_inference(self.save_dir) + generated_text = self._run_sglang_inference(quantized_model_path) print(generated_text) assert "!!!" not in generated_text @@ -117,13 +117,13 @@ def test_awq_format_sglang(self, dataloader): dataset=dataloader, ) - autoround.quantize_and_save( + _, quantized_model_path = autoround.quantize_and_save( output_dir=self.save_dir, inplace=True, format="auto_round:auto_awq", ) - generated_text = self._run_sglang_inference(self.save_dir) + generated_text = self._run_sglang_inference(quantized_model_path) print(generated_text) assert "!!!" 
not in generated_text diff --git a/test/test_cuda/integrations/test_vllm.py b/test/test_cuda/integrations/test_vllm.py index 3cef719e5..5ca4a2469 100644 --- a/test/test_cuda/integrations/test_vllm.py +++ b/test/test_cuda/integrations/test_vllm.py @@ -100,7 +100,7 @@ def test_mixed_llmcompressor_format_vllm(tiny_opt_model_path, dataloader, tmp_pa layer_config=layer_config, ) quantized_model_path = str(tmp_path / "saved") - autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") # verify loading. llm = LLM( @@ -165,9 +165,9 @@ def test_auto_round_awq_format_vllm(): iters=1, seqlen=2, ) - autoround.quantize_and_save(output_dir=save_dir, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=save_dir, format="auto_round:auto_awq") sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32) - llm = LLM(model=save_dir, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.7) + llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.7) outputs = llm.generate(["The capital of France is"], sampling_params) generated_text = outputs[0].outputs[0].text print(generated_text) diff --git a/test/test_cuda/models/test_fp8_model.py b/test/test_cuda/models/test_fp8_model.py index 1b7e97e4d..99b81f9dd 100644 --- a/test/test_cuda/models/test_fp8_model.py +++ b/test/test_cuda/models/test_fp8_model.py @@ -32,14 +32,14 @@ def setup_and_teardown_class(self): def test_small_model_rtn_generation(self, mock_fp8_capable_device, tiny_fp8_qwen_model_path): ar = AutoRound(tiny_fp8_qwen_model_path, iters=0, disable_opt_rtn=True) - ar.quantize_and_save(output_dir=self.save_dir) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) generate_prompt(model, tokenizer) def test_gguf_imatrix(self, mock_fp8_capable_device, tiny_fp8_qwen_model_path): ar = AutoRound(tiny_fp8_qwen_model_path, iters=0) - ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # # gguf_file = os.listdir("saved/Qwen3-0.6B-FP8/-gguf")[0] @@ -47,8 +47,8 @@ def test_gguf_imatrix(self, mock_fp8_capable_device, tiny_fp8_qwen_model_path): # output = llm("There is a girl who likes adventure,", max_tokens=32) # print(output) # shutil.rmtree("./saved", ignore_errors=True) - # model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path ) # text = "There is a girl who likes adventure," # inputs = tokenizer(text, return_tensors="pt").to(model.device) # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) @@ -86,11 +86,11 @@ def test_fp8_model_gguf_q4(self, mock_fp8_capable_device, 
tiny_fp8_qwen_model_pa from llama_cpp import Llama ar = AutoRound(tiny_fp8_qwen_model_path, iters=0, disable_opt_rtn=True) - ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") - for file in os.listdir(self.save_dir): + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") + for file in os.listdir(quantized_model_path): if file.endswith(".gguf"): gguf_file = file - llm = Llama(f"{self.save_dir}/{gguf_file}", n_gpu_layers=-1) + llm = Llama(f"{quantized_model_path}/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) @@ -99,11 +99,11 @@ def test_fp8_model_gguf_q3(self, mock_fp8_capable_device, tiny_fp8_qwen_model_pa from llama_cpp import Llama ar = AutoRound(tiny_fp8_qwen_model_path, iters=1) - ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") - for file in os.listdir(self.save_dir): + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") + for file in os.listdir(quantized_model_path): if file.endswith(".gguf"): gguf_file = file - llm = Llama(f"{self.save_dir}/{gguf_file}", n_gpu_layers=-1) + llm = Llama(f"{quantized_model_path}/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) @@ -113,8 +113,8 @@ def test_diff_datatype(self, scheme, tiny_fp8_qwen_model_path, mock_fp8_capable_ model_name = tiny_fp8_qwen_model_path print(f"Testing scheme: {scheme}") ar = AutoRound(model_name, iters=0, scheme=scheme, disable_opt_rtn=True, nsamples=2) - ar.quantize_and_save(output_dir=self.save_dir) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True) assert model is not None, f"Failed to load model for scheme {scheme}" @@ -127,9 +127,9 @@ def test_qwen3_fp8_moe_mxfp(tiny_fp8_qwen_moe_model_path, mock_fp8_capable_devic seqlen=32, iters=0, ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." - loaded_model = AutoModelForCausalLM.from_pretrained(output_dir) + loaded_model = AutoModelForCausalLM.from_pretrained(quantized_model_path) for n, m in quantized_model.named_modules(): if m.__class__.__name__ == "QuantLinear": loaded_m = loaded_model.get_submodule(n) diff --git a/test/test_cuda/models/test_moe_model.py b/test/test_cuda/models/test_moe_model.py index 95c61bd55..8ba5e871d 100644 --- a/test/test_cuda/models/test_moe_model.py +++ b/test/test_cuda/models/test_moe_model.py @@ -22,10 +22,10 @@ def test_qwen3_5_moe(tiny_qwen35_moe_model_path): seqlen=32, iters=1, ) - quantized_model, _ = ar.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." 
- loaded_model = Qwen3_5MoeForConditionalGeneration.from_pretrained(output_dir) + loaded_model = Qwen3_5MoeForConditionalGeneration.from_pretrained(quantized_model_path) loaded_model.to("cuda") inp = torch.randint(0, 100, (1, 64)).to("cuda") diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py index eaca19d3f..97ac64288 100644 --- a/test/test_cuda/quantization/test_asym.py +++ b/test/test_cuda/quantization/test_asym.py @@ -29,15 +29,15 @@ def setup_and_teardown_class(self): def test_asym_group_size_with_tuning(self, group_size, tiny_opt_model_path): bits, sym = 4, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - _, saved_folders = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) @pytest.mark.skip_ci(reason="Not necessary since it's covered by backend tests") # skip this test in CI @@ -45,15 +45,15 @@ def test_asym_group_size_with_tuning(self, group_size, tiny_opt_model_path): def test_asym_bits_with_tuning(self, bits, tiny_opt_model_path): group_size, sym = 128, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - _, saved_folders = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) @pytest.mark.skip_ci(reason="Not necessary since it's covered by backend tests") # skip this test in CI @@ -61,17 +61,17 @@ def test_asym_bits_with_tuning(self, bits, tiny_opt_model_path): def test_asym_format_with_tuning(self, format, tiny_opt_model_path): bits, group_size, sym = 4, 128, False ar = AutoRound(tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - _, saved_folders = ar.quantize_and_save(format=format, output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir) if format == "auto_round:auto_gptq": # Cannot load correctly, skip auto_gptq since it's deprecated. 
return model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], + quantized_model_path, torch_dtype="auto", device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py index 0befbe6f6..bdc79075e 100644 --- a/test/test_cuda/quantization/test_mxfp_nvfp.py +++ b/test/test_cuda/quantization/test_mxfp_nvfp.py @@ -55,11 +55,11 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" - _, saved_folders = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], + quantized_model_path, torch_dtype="auto", ) model.eval() @@ -144,11 +144,11 @@ def test_qwen_moe_quant_infer(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - _, saved_folders = autoround.quantize_and_save( + _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="auto_round" ) - model = AutoModelForCausalLM.from_pretrained(saved_folders[0], torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from ...helpers import evaluate_accuracy evaluate_accuracy(model, tokenizer, threshold=0.49, batch_size=16, task="piqa", limit=10) diff --git a/test/test_cuda/quantization/test_torch_compile.py b/test/test_cuda/quantization/test_torch_compile.py index 620ef75b0..e7efe0bf2 100644 --- a/test/test_cuda/quantization/test_torch_compile.py +++ b/test/test_cuda/quantization/test_torch_compile.py @@ -64,9 +64,9 @@ def test_gguf_q2ks_torch_compile_iters0(self, tiny_qwen_model_path): seqlen=16, enable_torch_compile=True, ) - autoround.quantize_and_save(output_dir=self.save_dir, format="gguf:q2_k_s") + _, quantized_model_path = autoround.quantize_and_save(output_dir=self.save_dir, format="gguf:q2_k_s") - saved_files = [f for f in os.listdir(self.save_dir) if f.endswith(".gguf")] + saved_files = [f for f in os.listdir(quantized_model_path) if f.endswith(".gguf")] assert len(saved_files) > 0, "No GGUF file was generated" shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/transform/test_mxfp4_transform.py b/test/test_cuda/transform/test_mxfp4_transform.py index fba84e678..524645ea4 100644 --- a/test/test_cuda/transform/test_mxfp4_transform.py +++ b/test/test_cuda/transform/test_mxfp4_transform.py @@ -38,10 +38,10 @@ def test_transform_mxfp4_quant_infer(self): scheme=scheme, hadamard_config="default", ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") + compressed_model, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", device_map="cuda") - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 
torch_dtype="auto", device_map="cuda") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from ...helpers import generate_prompt generate_prompt(model, tokenizer) @@ -57,10 +57,10 @@ def test_transform_mxfp4_tuning_quant_infer(self): scheme=scheme, hadamard_config="default", ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") + compressed_model, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", device_map="cuda") - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cuda") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from ...helpers import generate_prompt generate_prompt(model, tokenizer) @@ -76,10 +76,10 @@ def test_random_transform_mxfp4_quant_infer(self): scheme=scheme, hadamard_config="random_hadamard", ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") + compressed_model, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", device_map="cuda") - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cuda") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from ...helpers import generate_prompt generate_prompt(model, tokenizer) diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 21ec59552..d03c71444 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -48,13 +48,13 @@ def test_gptq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], device_map=self.device, quantization_config=quantization_config + quantized_model_path, device_map=self.device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) @@ -79,13 +79,15 @@ def test_awq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - _, saved_folders = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - saved_folders[0], device_map=self.device, quantization_config=quantization_config + quantized_model_path, device_map=self.device, quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(saved_folders[0]) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, 
return_tensors="pt").to(model.device)
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
@@ -110,7 +112,9 @@ def test_scheme(self, scheme, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        _, quantized_model_path = ar.quantize_and_save(
+            output_dir=quantized_model_path, inplace=True, format="auto_round"
+        )

         # test loading
         if scheme not in ["FPW8A16"]:  # FPW8A16 group_size is 0
@@ -141,7 +145,9 @@ def test_vlm_model(self, dataloader):
         )

         quantized_model_path = "./saved"
-        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        _, quantized_model_path = ar.quantize_and_save(
+            output_dir=quantized_model_path, inplace=True, format="auto_round"
+        )
         quantization_config = AutoRoundConfig(backend="auto")

         import requests
@@ -212,7 +218,9 @@ def test_quant_lm_head(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        _, quantized_model_path = ar.quantize_and_save(
+            output_dir=quantized_model_path, inplace=True, format="auto_round"
+        )
         quantization_config = AutoRoundConfig(backend="auto")

         model = AutoModelForCausalLM.from_pretrained(

From 29d2b64eca9f50d9fad5015fe2ac97bb5952a4f3 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Fri, 3 Apr 2026 10:09:08 +0800
Subject: [PATCH 34/90] fix

Signed-off-by: n1ck-guo
---
 .../algorithms/quantization/rtn/quantizer.py  |  5 ++++-
 auto_round/compressors_new/calib.py           | 16 +++++++++++++++-
 test/test_cpu/core/test_autoround.py          |  8 ++++----
 test/test_cpu/export/test_export.py           | 19 ++++++++++++-------
 test/test_cpu/quantization/test_mxfp_nvfp.py  |  2 +-
 test/test_cuda/backends/test_torch_backend.py |  2 +-
 6 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py
index 94dea2dfe..3d3f23ad7 100644
--- a/auto_round/algorithms/quantization/rtn/quantizer.py
+++ b/auto_round/algorithms/quantization/rtn/quantizer.py
@@ -200,6 +200,9 @@ def __init__(self, config: RTNConfig):

         self.enable_alg_ext = True

+    def quantize_layer_outside_block(self, *args, **kwargs):
+        return self.quantize_layer(*args, **kwargs)
+
     def quantize_block(self, block: torch.nn.Module, **kwargs):
         """Apply imatrix-informed RTN quantization to a block.

@@ -220,6 +223,6 @@ def quantize_block(self, block: torch.nn.Module, **kwargs):
             if hasattr(m, "imatrix"):
                 m.imatrix /= m.imatrix_cnt
             if hasattr(m, "global_name") and check_to_quantized(m):
-                self.quantize_layer(m.global_name)
+                self.quantize_layer_outside_block(m.global_name)

    # _get_block_outputs and _sampling_inputs are defined in BaseQuantizers and inherited.
diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py
index 08791f303..ec4b41979 100644
--- a/auto_round/compressors_new/calib.py
+++ b/auto_round/compressors_new/calib.py
@@ -1386,7 +1386,21 @@ def _quantize_impl(self):

         # Release memory
         clear_memory(device_list=self.compress_context.device_list)
-        self._quant_rtn_with_imatrix()
+        enable_imatrix = False
+        if not getattr(self, "disable_opt_rtn", True):
+            formats = getattr(self, "formats", None) or []
+            has_gguf_k = (
+                any(fmt.is_gguf() and "k" in fmt.output_format for fmt in formats) or self.super_bits is not None
+            )
+            if has_gguf_k:
+                enable_imatrix = True
+            elif self.data_type == "int" and self.sym:
+                enable_imatrix = True
+
+        if enable_imatrix:
+            self._quant_rtn_with_imatrix()
+        else:
+            self._quantize_via_rtn_blockwise()

         convert_module_to_hp_if_necessary(
             self.model_context.model,
diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
index 8ac9ca53a..23c3d5fa9 100644
--- a/test/test_cpu/core/test_autoround.py
+++ b/test/test_cpu/core/test_autoround.py
@@ -428,7 +428,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader):

         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.save_quantized(
-            output_dir=quantized_model_path, format="auto_round", inplace=True
+            output_dir=quantized_model_path, format="auto_round", inplace=True, return_folders=True
         )

         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
@@ -461,7 +461,7 @@ def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader):

         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.save_quantized(
-            output_dir=quantized_model_path, format="auto_awq", inplace=True
+            output_dir=quantized_model_path, format="auto_awq", inplace=True, return_folders=True
         )

         quantization_config = AutoRoundConfig()
@@ -498,7 +498,7 @@ def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader):

         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.save_quantized(
-            output_dir=quantized_model_path, format="auto_gptq", inplace=True
+            output_dir=quantized_model_path, format="auto_round", inplace=True, return_folders=True
         )

         quantization_config = AutoRoundConfig()
@@ -535,7 +535,7 @@ def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader):

         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.save_quantized(
-            output_dir=quantized_model_path, format="auto_round", inplace=True
+            output_dir=quantized_model_path, format="auto_round", inplace=True, return_folders=True
         )

         quantization_config = AutoRoundConfig()
diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
index 3df20dcdd..1c7dfbd08 100644
--- a/test/test_cpu/export/test_export.py
+++ b/test/test_cpu/export/test_export.py
@@ -361,7 +361,9 @@ def test_gptq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_gptq"
+        )
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
@@ -381,6 +383,7 @@ def test_export_format(self):
             self.model_name,
             scheme="FP8_STATIC",
         )
+        autoround.post_init()
         format_list = 
get_formats("auto_round, llm_compressor, auto_round:llm_compressor", autoround) assert len(format_list) == 3 assert format_list[0].output_format == "auto_round" @@ -394,6 +397,7 @@ def test_export_format(self): self.model_name, scheme="W4A16", ) + autoround.post_init() format_list = get_formats("auto_round:auto_awq, auto_gptq", autoround) assert format_list[0].output_format == "auto_round" assert format_list[0].get_backend_name() == "auto_round:auto_awq" @@ -404,6 +408,7 @@ def test_export_format(self): model=self.model_name, scheme="INT8_W8A8", ) + autoround.post_init() format_list = get_formats("llm_compressor, auto_round:llm_compressor", autoround) assert format_list[0].output_format == "llm_compressor" assert format_list[0].get_backend_name() == "llm_compressor:int8_w8a8" @@ -422,10 +427,10 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): ) ar.post_init() with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"): - get_formats("auto_round:auto_awq", ar.quantizer) + get_formats("auto_round:auto_awq", ar) with pytest.raises(ValueError, match="but got bits=2, data_type=int"): - get_formats("auto_round:llm_compressor", ar.quantizer) + get_formats("auto_round:llm_compressor", ar) ar = AutoRound( model=tiny_qwen_model_path, @@ -436,7 +441,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): ) ar.post_init() with pytest.raises(ValueError, match="but got data_type=fp, bits=4"): - get_formats("auto_round:llm_compressor", ar.quantizer) + get_formats("auto_round:llm_compressor", ar) ar = AutoRound( model=tiny_qwen_model_path, @@ -446,7 +451,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): sym=True, ) ar.post_init() - get_formats("auto_round:auto_awq", ar.quantizer) + get_formats("auto_round:auto_awq", ar) def test_autoawq_qwen3_vl_infer(self, dataloader): model_path = get_model_path("Qwen/Qwen3-VL-2B-Instruct") @@ -489,7 +494,7 @@ def test_llmc_dynamic_wint8aint8_export(self): scheme="INT8_W8A8", ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f: assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8 @@ -499,7 +504,7 @@ def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader): autoround = AutoRound(self.model_name, iters=1, nsamples=2, seqlen=2, dataset=dataloader, scheme="INT8_W8A8") quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f: assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8 diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index acd411954..521fe74a5 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -314,7 +314,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, 
dataloader): quantized_model_path = self.save_dir autoround.quantize() compressed_model, quantized_model_path = autoround.save_quantized( - output_dir=quantized_model_path, format="auto_round" + output_dir=quantized_model_path, format="auto_round", return_folders=True ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py index fafa5ccff..71e743f14 100644 --- a/test/test_cuda/backends/test_torch_backend.py +++ b/test/test_cuda/backends/test_torch_backend.py @@ -135,7 +135,7 @@ def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): quantized_model_path = self.save_dir _, quantized_model_path = autoround.save_quantized( - output_dir=quantized_model_path, inplace=False, format="auto_round" + output_dir=quantized_model_path, inplace=False, format="auto_round", return_folders=True ) device = "auto" ##cpu, hpu, cuda From 1c9e5296994de9e9cd852ad138f372828943dff4 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 3 Apr 2026 13:54:38 +0800 Subject: [PATCH 35/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 4 ++-- auto_round/compressors_new/entry.py | 3 ++- test/__init__.pyc | Bin 141 -> 0 bytes 3 files changed, 4 insertions(+), 3 deletions(-) delete mode 100755 test/__init__.pyc diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index ec4b41979..f78b45c7b 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1212,7 +1212,7 @@ def _quantize_via_rtn_blockwise(self) -> None: # ── Infrastructure: register act_max hook and run forward pass ── hook_handles = self.quantizer._register_act_max_hook(block) - self.quantizer._get_block_outputs( + input_ids = self.quantizer._get_block_outputs( block, input_ids, input_others, @@ -1394,7 +1394,7 @@ def _quantize_impl(self): ) if has_gguf_k: enable_imatrix = True - elif self.data_type == "int" and self.sym: + elif self.data_type == "int" and self.sym and self.bits < 8: enable_imatrix = True if enable_imatrix: diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index b97d8aa65..53f3482b8 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -196,9 +196,10 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): _resolved = _preview_resolved_attrs(quant_config, scheme) _sym = _resolved.get("sym", getattr(quant_config, "sym", None)) _data_type = _resolved.get("data_type", getattr(quant_config, "data_type", "") or "") + _bits = _resolved.get("bits", getattr(quant_config, "bits", None)) if _sym is not None and _sym is False: enable_imatrix = False - elif _data_type == "int": + elif _data_type == "int" and (_bits is None or _bits < 8): enable_imatrix = True elif is_weight_scheme(scheme): enable_imatrix = True diff --git a/test/__init__.pyc b/test/__init__.pyc deleted file mode 100755 index 56ea0061b2641c037f4f73888d6b74e44a3ec687..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 141 zcmZSn%*&;~H#sAj0SXv_v;z{p`6;D2 MsdgX>N`ROF0LsB0EdT%j From 7e7fdeb5da093410adc0bd0ca2d6ab82b9d8bdcf Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 3 Apr 2026 15:23:51 +0800 Subject: [PATCH 36/90] fix vlm ut Signed-off-by: n1ck-guo --- test/test_cpu/core/test_autoround.py | 6 ------ test/test_cpu/export/test_gguf_format.py | 8 ++++---- test/test_cpu/models/test_mllm.py | 18 ++++++++++-------- 
test/test_cpu/schemes/test_scheme.py | 5 +++-- test/test_cuda/advanced/test_multiple_card.py | 10 +++++----- test/test_cuda/models/test_mllm.py | 12 +++++++----- 6 files changed, 29 insertions(+), 30 deletions(-) diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index 23c3d5fa9..6ed626cd8 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -673,12 +673,6 @@ def test_compressor(self, tiny_qwen_vl_model_path): assert ar.optimizer == torch.optim.AdamW assert ar.mllm - # test old api - from auto_round import AutoRoundMLLM - - ar = AutoRoundMLLM(model_name) - assert ar.mllm - def test_attention_mask_in_dataset(self): from transformers import AutoTokenizer diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index cc7a9e4d7..92e0015e0 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -140,9 +140,9 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self, tiny_qwen_vl_model_path): - from auto_round import AutoRoundMLLM + from auto_round import AutoRound - autoround = AutoRoundMLLM( + autoround = AutoRound( tiny_qwen_vl_model_path, iters=0, nsamples=8, @@ -160,9 +160,9 @@ def test_vlm_gguf(self, tiny_qwen_vl_model_path): assert file_size < 270, f"file size {file_size} MB is too large for non-quantized mmproj-model.gguf" def test_vlm_gguf_wo_quant_nontext_module(self, tiny_qwen_vl_model_path): - from auto_round import AutoRoundMLLM + from auto_round import AutoRound - autoround = AutoRoundMLLM( + autoround = AutoRound( tiny_qwen_vl_model_path, iters=0, nsamples=8, diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index d75be5667..2c0c71bd4 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -4,13 +4,14 @@ import pytest from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration -from auto_round import AutoRoundMLLM +from auto_round import AutoRound from auto_round.utils import get_block_names from ...helpers import get_model_path, opt_name_or_path class FakeDataLoader: + def __init__(self): self.batch_size = 1 @@ -27,7 +28,8 @@ def __iter__(self): yield self.data -class TestAutoRoundMLLM: +class TestAutoRound: + @classmethod def setup_class(self): self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") @@ -43,7 +45,7 @@ def setup_save_dir(self, tmp_path): def test_tune(self, tiny_qwen_vl_model_path): bits, group_size = 4, 128 - autoround = AutoRoundMLLM( + autoround = AutoRound( model=tiny_qwen_vl_model_path, bits=bits, group_size=group_size, @@ -64,7 +66,7 @@ def test_quant_vision(self, tiny_qwen_vl_model_path): ## bug need to fix tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 - autoround = AutoRoundMLLM( + autoround = AutoRound( model, tokenizer, processor=processor, @@ -120,7 +122,7 @@ def test_diff_dataset(self, tiny_qwen_vl_model_path): ) bits, group_size = 4, 128 dataset = ["dataset test", "list test"] - autoround = AutoRoundMLLM( + autoround = AutoRound( model, tokenizer, processor=processor, @@ -154,7 +156,7 @@ def test_str_input(self): ) bits, group_size = 4, 128 dataset = ["test pure text", "input for mllm"] - autoround = AutoRoundMLLM( + autoround = AutoRound( model, tokenizer, processor=processor, @@ -215,7 +217,7 @@ def test_qwen2_5(self, 
tiny_qwen_2_5_vl_model_path): model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) - autoround = AutoRoundMLLM( + autoround = AutoRound( model, tokenizer, iters=1, @@ -271,7 +273,7 @@ def test_mllm_early_stop_tracking(self, tiny_qwen_2_5_vl_model_path): model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) - autoround = AutoRoundMLLM( + autoround = AutoRound( model, tokenizer, iters=1, diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index 3c4ecc713..b249bc50a 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -11,6 +11,7 @@ class TestAutoRound: + @pytest.fixture(autouse=True) def setup_save_folder(self, tmp_path): self.save_folder = str(tmp_path / "saved") @@ -103,9 +104,9 @@ def test_mxfp4_rceil(self, tiny_opt_model_path): assert model is not None, "Model loading failed after quantization with MXFP4 scheme" def test_vlm(self, tiny_qwen_vl_model_path): - from auto_round import AutoRoundMLLM + from auto_round import AutoRound - ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + ar = AutoRound(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 diff --git a/test/test_cuda/advanced/test_multiple_card.py b/test/test_cuda/advanced/test_multiple_card.py index ac6d0fb22..abc3bddb6 100644 --- a/test/test_cuda/advanced/test_multiple_card.py +++ b/test/test_cuda/advanced/test_multiple_card.py @@ -198,24 +198,24 @@ def test_device_map_for_triton(self): @multi_card def test_mllm_device_map(self, tiny_qwen_2_5_vl_model_path): - from auto_round import AutoRoundMLLM + from auto_round import AutoRound device_map = "0,1" - ar = AutoRoundMLLM(tiny_qwen_2_5_vl_model_path, device_map=device_map) + ar = AutoRound(tiny_qwen_2_5_vl_model_path, device_map=device_map) assert ar.device == "cuda:0" assert ar.device_map == device_map device_map = 1 - ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) + ar = AutoRound(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) assert ar.device == "cuda:1" assert ar.device_map == device_map device_map = "auto" - ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) + ar = AutoRound(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) assert ar.device == "cuda" assert ar.device_map == device_map device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1} - ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) + ar = AutoRound(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) assert ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device == "cuda:0" assert ar.model.model.visual.blocks[0].mlp.gate_proj.tuning_device == "cuda:1" diff --git a/test/test_cuda/models/test_mllm.py b/test/test_cuda/models/test_mllm.py index 52d09175c..ba79f77f8 100644 --- a/test/test_cuda/models/test_mllm.py +++ b/test/test_cuda/models/test_mllm.py @@ -8,7 +8,7 @@ from PIL import Image from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration -from auto_round import AutoRoundMLLM +from auto_round import AutoRound from auto_round.utils import get_block_names from ...envs import require_gptqmodel, require_optimum, require_vlm_env @@ -16,6 
+16,7 @@ class VisionDataLoader: + def __init__(self): self.batch_size = 1 @@ -37,7 +38,8 @@ def __iter__(self): @pytest.mark.skip_ci(reason="Only tiny model is suggested") -class TestAutoRoundMLLM: +class TestAutoRound: + @pytest.fixture(autouse=True) def _save_dir(self, tmp_path): self.save_dir = str(tmp_path / "saved") @@ -112,13 +114,13 @@ def qwen_inference(self, quantized_model_dir): @require_gptqmodel @require_optimum def test_vlm_tune(self): - from auto_round import AutoRoundMLLM + from auto_round import AutoRound ## load the model model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") ## quantize the model bits, group_size, sym = 4, 128, True - autoround = AutoRoundMLLM(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, nsamples=1) + autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, nsamples=1) autoround.quantize() quantized_model_path = self.save_dir @@ -181,7 +183,7 @@ def test_llama32_vision_early_stop_tracking(self): model_path, trust_remote_code=True, device_map="auto", torch_dtype="auto" ) - autoround = AutoRoundMLLM( + autoround = AutoRound( model=model, tokenizer=tokenizer, processor=processor, From 4a035fbb28bb7f72e202d0a26cadba44c9204b02 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 06:48:58 +0000 Subject: [PATCH 37/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/algorithms/transforms/__init__.py | 1 + auto_round/algorithms/transforms/base.py | 1 + .../algorithms/transforms/hadamard/apply.py | 1 + .../algorithms/transforms/hadamard/config.py | 1 + .../algorithms/transforms/hadamard/patch.py | 1 + .../transforms/hadamard/transforms.py | 1 + .../transforms/hadamard/utils/math.py | 1 + .../transforms/hadamard/utils/matrix.py | 1 + .../architecture_visualization.py | 42 +++++++------------ auto_round/compressors_new/calib.py | 2 +- auto_round/compressors_new/utils.py | 2 +- auto_round/compressors_new/zero_shot.py | 2 +- 12 files changed, 25 insertions(+), 31 deletions(-) diff --git a/auto_round/algorithms/transforms/__init__.py b/auto_round/algorithms/transforms/__init__.py index ddc00687f..247a49f99 100644 --- a/auto_round/algorithms/transforms/__init__.py +++ b/auto_round/algorithms/transforms/__init__.py @@ -34,6 +34,7 @@ >>> from auto_round.algorithms.transforms import apply_rotation >>> model = apply_rotation(model, config={"hadamard_type": "random_hadamard"}) """ + from __future__ import annotations from typing import Any diff --git a/auto_round/algorithms/transforms/base.py b/auto_round/algorithms/transforms/base.py index 05bce2472..aeec1454a 100644 --- a/auto_round/algorithms/transforms/base.py +++ b/auto_round/algorithms/transforms/base.py @@ -17,6 +17,7 @@ ``BaseRotation`` and declare a corresponding ``BaseRotationConfig``. """ + from __future__ import annotations from abc import ABC, abstractmethod diff --git a/auto_round/algorithms/transforms/hadamard/apply.py b/auto_round/algorithms/transforms/hadamard/apply.py index 428827ff1..5f99aea09 100644 --- a/auto_round/algorithms/transforms/hadamard/apply.py +++ b/auto_round/algorithms/transforms/hadamard/apply.py @@ -18,6 +18,7 @@ * :class:`HadamardRotation` – the stateful algorithm object. * :func:`apply_hadamard_transform` – convenience one-shot function. 
""" + from __future__ import annotations from typing import Any diff --git a/auto_round/algorithms/transforms/hadamard/config.py b/auto_round/algorithms/transforms/hadamard/config.py index 8801c1d86..ebff618c8 100644 --- a/auto_round/algorithms/transforms/hadamard/config.py +++ b/auto_round/algorithms/transforms/hadamard/config.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Hadamard rotation algorithm configuration.""" + from __future__ import annotations from typing import Any diff --git a/auto_round/algorithms/transforms/hadamard/patch.py b/auto_round/algorithms/transforms/hadamard/patch.py index 4c5006f81..e088e0c17 100644 --- a/auto_round/algorithms/transforms/hadamard/patch.py +++ b/auto_round/algorithms/transforms/hadamard/patch.py @@ -21,6 +21,7 @@ Each patch is idempotent: calling it twice has no effect. """ + from __future__ import annotations import torch diff --git a/auto_round/algorithms/transforms/hadamard/transforms.py b/auto_round/algorithms/transforms/hadamard/transforms.py index 00b23aef4..8f70e46b8 100644 --- a/auto_round/algorithms/transforms/hadamard/transforms.py +++ b/auto_round/algorithms/transforms/hadamard/transforms.py @@ -17,6 +17,7 @@ :class:`RandomHadamardTransform` – randomly signed Hadamard. :func:`build_hadamard_transform` – factory that selects the right class. """ + from __future__ import annotations import inspect diff --git a/auto_round/algorithms/transforms/hadamard/utils/math.py b/auto_round/algorithms/transforms/hadamard/utils/math.py index 2bb11b099..14b15ce0d 100644 --- a/auto_round/algorithms/transforms/hadamard/utils/math.py +++ b/auto_round/algorithms/transforms/hadamard/utils/math.py @@ -16,6 +16,7 @@ Provides ``deterministic_hadamard_matrix`` (Sylvester construction) and ``random_hadamard_matrix`` (loaded from a precomputed safetensors file). 
""" + # note that hadamard matrix multiplication reuses code from # https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/hadamard.py diff --git a/auto_round/algorithms/transforms/hadamard/utils/matrix.py b/auto_round/algorithms/transforms/hadamard/utils/matrix.py index e459127d1..c8c723d83 100644 --- a/auto_round/algorithms/transforms/hadamard/utils/matrix.py +++ b/auto_round/algorithms/transforms/hadamard/utils/matrix.py @@ -16,6 +16,7 @@ Note: ``apply_transform_weight`` reuses ideas from https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/matrix.py """ + from __future__ import annotations import torch diff --git a/auto_round/compressors_new/architecture_visualization.py b/auto_round/compressors_new/architecture_visualization.py index 2a0fc9648..7ff020733 100644 --- a/auto_round/compressors_new/architecture_visualization.py +++ b/auto_round/compressors_new/architecture_visualization.py @@ -90,8 +90,7 @@ def print_post_init_flow(): print("BaseCompressor.post_init() Execution Flow") print("=" * 110 + "\n") - print( - """ + print(""" BaseCompressor.post_init() │ ├─ Step 1: Resolve formats (str → list[OutputFormat]) @@ -113,8 +112,7 @@ def print_post_init_flow(): │ └─ back-fill to_quant_block_names if it was None │ └─ Step 4: Setup device map, torch compile, offloader - """ - ) + """) print("=" * 110 + "\n") @@ -128,8 +126,7 @@ def print_usage_examples(): print("Example 1: MLLM + AutoRoundCompatible (gradient-based)") print("-" * 110) - print( - """ + print(""" from auto_round.compressors_new.entry import AutoRound from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig @@ -142,13 +139,11 @@ def print_usage_examples(): quant_nontext_module=False, # set True to also quantize vision encoder ) # Dynamically creates: class MLLMCalibCompressor(MLLMMixin, CalibCompressor) - """ - ) + """) print("\nExample 2: MLLM + RTN with imatrix") print("-" * 110) - print( - """ + print(""" from auto_round.algorithms.quantization.rtn.config import RTNConfig config = RTNConfig(scheme="W4A16") @@ -159,13 +154,11 @@ def print_usage_examples(): processor=processor, ) # Dynamically creates: class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) - """ - ) + """) print("\nExample 3: Diffusion + AutoRoundCompatible") print("-" * 110) - print( - """ + print(""" config = SignRoundConfig(scheme="W4A16", iters=200) compressor = AutoRound( config=config, @@ -173,8 +166,7 @@ def print_usage_examples(): guidance_scale=7.5, ) # Dynamically creates: class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) - """ - ) + """) print("\n" + "=" * 110 + "\n") @@ -188,8 +180,7 @@ def print_mro_example(): print("For class MLLMCalibCompressor(MLLMMixin, CalibCompressor):") print("-" * 110) - print( - """ + print(""" MLLMCalibCompressor (dynamic, created in AutoRound.__new__) └─> MLLMMixin └─> CalibCompressor @@ -212,8 +203,7 @@ def print_mro_example(): ✓ MLLM features from MLLMMixin (processor, template, calib() override) ✓ Calibration compression from CalibCompressor ✓ Model/context management from BaseCompressor - """ - ) + """) print("=" * 110 + "\n") @@ -225,8 +215,7 @@ def print_decision_tree(): print("AutoRound Creation Decision Tree") print("=" * 110 + "\n") - print( - """ + print(""" AutoRound.__new__(config, model, format, **kwargs) │ ├─ Step 1: Detect model type @@ -264,8 +253,7 @@ def print_decision_tree(): │ └─> class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) └─ 
model_type == "llm" └─> ZeroShotCompressor - """ - ) + """) print("=" * 110 + "\n") @@ -277,8 +265,7 @@ def print_quantizer_interface(): print("BaseQuantizers Interface - Name-based quantize_block / quantize_layer") print("=" * 110 + "\n") - print( - """ + print(""" All quantizers use module *names* (str) instead of module objects. The module is retrieved internally via get_module(model, name). @@ -295,8 +282,7 @@ def print_quantizer_interface(): ├─ RTNQuantizer.quantize_block(block_name: str) ├─ OptimizedRTNQuantizer.quantize_block(block_name: str, input_ids, input_others) └─ SignRoundQuantizer.quantize_block(block_name: Union[str, list[str]], input_ids, input_others) - """ - ) + """) print("=" * 110 + "\n") diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index f78b45c7b..bf6b3e0e8 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1120,7 +1120,7 @@ def __init__( def _quantize_via_rtn_blockwise(self) -> None: """Quantize model layers block by block using cached inputs and imatrix.""" - all_blocks = self.quantizer.quant_block_list if self.quantizer.quant_block_list else get_block_names(self.model) + all_blocks = self.quantizer.quant_block_list or get_block_names(self.model) if not all_blocks: raise ValueError("Could not find any blocks. Check the model or quant_block_list.") diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index b8af2f8de..f1342af61 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -391,7 +391,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str # 5. collect supported modules embedding_types = (torch.nn.Embedding,) - gguf_name = gguf_format_name if gguf_format_name else get_gguf_scheme(default_scheme) + gguf_name = gguf_format_name or get_gguf_scheme(default_scheme) if gguf_name: if torch.nn.Embedding not in supported_types: supported_types = (*supported_types, torch.nn.Embedding) diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 49df93bcb..7f8e9cbc6 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -114,7 +114,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if use_blockwise_quantization: # The ram usage is a little higher - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) + all_blocks = self.quant_block_list or get_block_names(self.model) pbar = tqdm(range(sum(len(block) for block in all_blocks))) for block_names in all_blocks: for block_name in block_names: From 463bb6c2a9fca8ec33ffd0ed9029e7cbd99b1fc3 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 7 Apr 2026 14:49:11 +0800 Subject: [PATCH 38/90] fix ut Signed-off-by: n1ck-guo --- .../algorithms/quantization/rtn/quantizer.py | 6 +- auto_round/compressors_new/base.py | 148 +++++++++++++++--- auto_round/compressors_new/entry.py | 104 +++++------- auto_round/compressors_new/utils.py | 4 + auto_round/context/compress.py | 1 - auto_round/export/export_to_gguf/export.py | 4 - test/test_cpu/export/test_gguf_format.py | 6 +- test/test_cpu/models/test_conv1d.py | 4 +- 8 files changed, 181 insertions(+), 96 deletions(-) diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 3d3f23ad7..555542841 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ 
b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -111,7 +111,11 @@ def quantize_layer(self, name: str, dtype: torch.dtype = None) -> None: set_module(self.model, name, m) tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compress_context.device # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic - if self.compress_context.is_immediate_packing and self.compress_context.formats[0].is_gguf(): + if ( + self.compress_context.is_immediate_packing + and self.compress_context.formats[0].is_gguf() + and not getattr(self.config, "disable_opt_rtn", False) + ): m = m.to(tuning_device) m.scale = None m.zp = None diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 929b7fe7e..c607087c7 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -547,18 +547,44 @@ def post_init(self) -> None: ``quantize_and_save()`` does this automatically before entering the inference-mode quantize loop. - The five phases in order: - 1. Scheme resolution – pure config, no model structure needed. - 2. Format resolution – needs data_type/bits from phase 1. - 3. Model patching – needs formats from phase 2. - 4. Layer-config build – needs patched model from phase 3. - 5. Hardware setup – device map, torch.compile, offloading. + Delegates to five ordered pipeline phases; see each ``_resolve_scheme``, + ``_resolve_formats``, ``_patch_model``, ``_build_layer_config``, and + ``_hardware_setup`` for the precise preconditions and postconditions. """ if self._post_init_done: return - # ── Phase 1: resolve scheme ─────────────────────────────────────────── - # Initialize scheme state from quantize_config before resolving. + self._resolve_scheme() + self._resolve_formats() + self._patch_model() + self._build_layer_config() + self._hardware_setup() + + self._post_init_done = True + + # ── Pipeline phase methods ──────────────────────────────────────────────── + + def _resolve_scheme(self) -> None: + """Phase 1 – Scheme resolution and quantizer construction. + + Preconditions: + - ``self.quantize_config`` is a valid :class:`QuantizationConfig`. + + Work performed: + - Seeds scheme-related attrs (``scale_dtype``, ``ignore_layers``, + ``quant_lm_head``, ``to_quant_block_names``) from ``quantize_config``. + - Calls :meth:`resolve_scheme` to derive ``data_type``, ``bits``, + ``sym``, ``scale_dtype`` etc. and write them back to both ``self`` + and ``self.quantize_config``. + - Constructs ``self.quantizer`` from the now-resolved config and wires + it to the current model / context. + - Binds ``self.wrapper_block`` for later use in quantizers. + + Postconditions: + - ``self.scheme`` and ``self.quantize_config`` carry resolved scheme attrs. + - ``self.quantizer`` is ready; calibration params (``seqlen``, + ``nsamples``) are synced. + """ cfg = self.quantize_config self.scale_dtype = cfg.scale_dtype # self.layer_config is already set from __init__ (direct compressor param). @@ -585,7 +611,33 @@ def post_init(self) -> None: self.quantizer.nsamples = self.nsamples self.wrapper_block = wrapper_block - # ── Phase 2: resolve output format ─────────────────────────────────── + def _resolve_formats(self) -> None: + """Phase 2 – Format resolution, GGUF attr sync, and rotation application. + + Preconditions: + - Phase 1 complete: ``self.quantizer`` is initialised and the scheme + is resolved (``data_type``, ``bits``, ``sym`` etc. are final). 
+ + Work performed: + - Converts a string ``self.formats`` to a list of + :class:`~auto_round.formats.OutputFormat` objects via + :func:`~auto_round.formats.get_formats`. + - Initialises :class:`~auto_round.compressors_new.shard_writer.ShardWriter` + when formats are present. + - **(2b)** Detects GGUF-driven attribute mutations (``bits``, ``sym``, + ``data_type``, ``group_size``, etc.) that ``gguf_args_check`` may + have written onto ``self`` inside ``get_formats``, syncs them to + ``self.quantizer``, and rebuilds ``self.scheme`` accordingly. + - Merges any GGUF-injected entries into ``self.layer_config``. + - **(2d)** Applies rotation transforms from ``self.transform_configs``. + + Postconditions: + - ``self.formats`` is a list (or ``None``). + - ``self.compress_context.formats`` mirrors ``self.formats``. + - ``self.quantizer`` carries the GGUF-adjusted scheme attrs. + - ``self.scheme`` is consistent with the final quantization attrs. + - All rotation transforms have been applied to ``self.model_context.model``. + """ # get_formats() inspects data_type / bits etc. that were just resolved. if isinstance(self.formats, str): self.formats = get_formats(self.formats, self) @@ -595,10 +647,10 @@ def post_init(self) -> None: self.shard_writer = ShardWriter(self.model_context.model, bits=8) # Snapshot the user-specified layer_config before GGUF processing may - # add extra entries, so we can distinguish them later in Phase 2b. + # add extra entries, so we can distinguish them later in step 2b. _pre_gguf_layer_config = copy.copy(self.layer_config) or {} - # ── Phase 2b: propagate GGUF-adjusted attrs back to quantizer ──────── + # ── 2b: propagate GGUF-adjusted attrs back to quantizer ────────────── # gguf_args_check (called inside get_formats) may have overridden # bits / sym / data_type / super_bits / super_group_size / group_size # on *this* BaseCompressor object. The quantizer stored its own copies @@ -674,7 +726,7 @@ def post_init(self) -> None: for _lname, _lval in _gguf_layer_cfg.items(): self.layer_config.setdefault(_lname, _lval) - # ── Phase 2d: apply rotation transforms ────────────────────────────── + # ── 2d: apply rotation transforms ──────────────────────────────────── if self.transform_configs: check_supported_schemes(self.scheme) need_calibration = self.quantize_config.iters > 0 @@ -685,15 +737,50 @@ def post_init(self) -> None: need_calibration=need_calibration, ) - # ── Phase 3: patch model structure ─────────────────────────────────── - # update_module() may replace layers (e.g. MoE expert merging); must + def _patch_model(self) -> None: + """Phase 3 – Model structure patching. + + Preconditions: + - Phase 2 complete: ``self.formats`` is resolved so that + ``apply_patches`` can inspect format-specific requirements. + + Work performed: + - Delegates to :meth:`~auto_round.context.model.ModelContext.apply_patches` + which may replace or merge layers (e.g. MoE expert merging, adding + static-kv wrappers) to produce the final model topology. + + Postconditions: + - ``self.model_context.model`` reflects the definitive topology that + :meth:`_build_layer_config` will walk. + """ + # apply_patches() may replace layers (e.g. MoE expert merging); must # happen before configure_layer_config() so it sees the final topology. self.model_context.apply_patches(self.formats) - # ── Phase 4: build layer config ────────────────────────────────────── + def _build_layer_config(self) -> None: + """Phase 4 – Layer-config construction and quantizer sync. 
+ + Preconditions: + - Phase 3 complete: model topology is final. + - ``self.scheme`` and all scheme-resolved attrs are consistent with + the (possibly GGUF-adjusted) values set in Phase 2. + + Work performed: + - Calls :meth:`_scheme_post_init` which walks the patched model to + build ``self.layer_config``, ``self.quant_block_list``, etc. + On the AutoScheme path this also runs delta-loss forward/backward + passes to select per-layer schemes. + - Syncs the fully-resolved ``layer_config`` and related attrs to + ``self.quantizer`` so quantization methods have the complete view. + + Postconditions: + - ``self.layer_config`` is fully populated. + - ``self.quantizer`` mirrors ``layer_config``, ``has_qlayer_outside_block``, + ``regex_config``, ``quant_block_list``, ``to_quant_block_names``, + ``scale_dtype``, and ``ignore_layers``. + """ # configure_layer_config() walks the patched model; _gen_auto_scheme() # (AutoScheme path) runs delta-loss forward+backward passes. - # Both methods now live in BaseCompressor and operate on self directly. self._scheme_post_init() # Sync the fully-resolved scheme state to the quantizer so that @@ -707,7 +794,32 @@ def post_init(self) -> None: self.quantizer.scale_dtype = self.scale_dtype self.quantizer.ignore_layers = self.ignore_layers - # ── Phase 5: hardware / compile setup ──────────────────────────────── + def _hardware_setup(self) -> None: + """Phase 5 – Hardware and compile configuration. + + Preconditions: + - Phase 4 complete: ``layer_config`` is built and + ``has_qlayer_outside_block`` is known. + - ``self.quantize_config.data_type`` is the final resolved value + (needed by :meth:`_adjust_torch_compile`). + + Work performed: + - Applies the device map via :func:`~auto_round.utils.device.set_non_auto_device_map`. + - Re-evaluates ``torch.compile`` eligibility now that ``data_type`` is + resolved and writes the result back to ``compress_context``. + - Selects ``self.block_forward`` (compiled or plain). + - Resets the offload manager when ``low_cpu_mem_usage`` is active. + - Disables ``self.inplace`` when quantized layers live outside + transformer blocks (incompatible with in-place rewriting). + - Calls :meth:`_adjust_immediate_packing_and_saving` to decide whether + layers should be packed / written immediately after each block. + + Postconditions: + - ``self.block_forward`` is ready for use. + - ``compress_context.enable_torch_compile`` is final. + - ``self.inplace`` and ``self.is_immediate_packing`` / + ``self.is_immediate_saving`` are set to their definitive values. + """ set_non_auto_device_map(self.model_context.model, self.compress_context.device_map) # Re-evaluate torch.compile eligibility now that data_type is resolved. 
self._adjust_torch_compile(self.enable_torch_compile) @@ -727,8 +839,6 @@ def post_init(self) -> None: else: self._adjust_immediate_packing_and_saving() - self._post_init_done = True - # backward compatible with the legacy API def __getattr__(self, name: str) -> Any: if name in self.__dict__: diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 53f3482b8..a37a91b1e 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -41,6 +41,41 @@ def _preview_resolved_attrs(config, scheme=None) -> dict: return {} +# --------------------------------------------------------------------------- +# Compressor-class registry +# --------------------------------------------------------------------------- +# Maps (model_type, base_class_name) → combined class, created lazily. +_COMPRESSOR_REGISTRY: dict[tuple[str, str], type] = {} + + +def _get_compressor_class(model_type: str, base_cls: type) -> type: + """Return the compressor class for *base_cls* wired with the right model-type Mixin. + + For ``model_type == "llm"`` the bare *base_cls* is returned unchanged. + For ``"mllm"`` and ``"diffusion"`` the corresponding Mixin is prepended via + :func:`type` and the result is cached in ``_COMPRESSOR_REGISTRY`` so that + each ``(model_type, base_cls)`` pair is created at most once per process. + """ + if model_type == "llm": + return base_cls + key = (model_type, base_cls.__name__) + if key in _COMPRESSOR_REGISTRY: + return _COMPRESSOR_REGISTRY[key] + if model_type == "mllm": + from auto_round.compressors_new.mllm_mixin import MLLMMixin + + mixin = MLLMMixin + elif model_type == "diffusion": + from auto_round.compressors_new.diffusion_mixin import DiffusionMixin + + mixin = DiffusionMixin + else: + return base_cls + combined = type(f"{model_type.capitalize()}{base_cls.__name__}", (mixin, base_cls), {}) + _COMPRESSOR_REGISTRY[key] = combined + return combined + + def is_weight_scheme(scheme): if isinstance(scheme, str): return scheme.upper().startswith("W") @@ -151,30 +186,7 @@ def __new__( model_type = "mllm" if isinstance(quant_config, SignRoundConfig): - # For AutoRoundCompatible, we need calibration-based compression - # Dynamically create combined class using Mixin pattern - if model_type == "mllm": - from auto_round.compressors_new.mllm_mixin import MLLMMixin - - # Create dynamic class: MLLMMixin + CalibCompressor - class MLLMCalibCompressor(MLLMMixin, CalibCompressor): - """MLLM model with AutoRoundCompatible calibration compression""" - - pass - - return MLLMCalibCompressor(alg_configs, **local_args, **kwargs) - elif model_type == "diffusion": - from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - - # Create dynamic class: DiffusionMixin + CalibCompressor - class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): - """Diffusion model with AutoRoundCompatible calibration compression""" - - pass - - return DiffusionCalibCompressor(alg_configs, **local_args, **kwargs) - else: - return CalibCompressor(alg_configs, **local_args, **kwargs) + return _get_compressor_class(model_type, CalibCompressor)(alg_configs, **local_args, **kwargs) elif isinstance(quant_config, RTNConfig): enable_imatrix = False @@ -227,50 +239,10 @@ class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor): if enable_imatrix or needs_act_calib or is_auto_scheme: quant_config._alg_cls = "OptimizedRTNQuantizer" - # For RTN with calibration data, dynamically combine with model-specific Mixin - if model_type == "mllm": - from 
auto_round.compressors_new.mllm_mixin import MLLMMixin - - class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor): - """MLLM model with calibrated RTN compression""" - - pass - - return MLLMCalibratedRTNCompressor(alg_configs, **local_args, **kwargs) - elif model_type == "diffusion": - from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - - class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor): - """Diffusion model with calibrated RTN compression""" - - pass - - return DiffusionCalibratedRTNCompressor(alg_configs, **local_args, **kwargs) - else: - return CalibratedRTNCompressor(alg_configs, **local_args, **kwargs) + return _get_compressor_class(model_type, CalibratedRTNCompressor)(alg_configs, **local_args, **kwargs) else: quant_config._alg_cls = "RTNQuantizer" - # Zero-shot RTN: no calibration data needed - if model_type == "mllm": - from auto_round.compressors_new.mllm_mixin import MLLMMixin - - class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor): - """MLLM model with zero-shot RTN compression""" - - pass - - return MLLMZeroShotCompressor(alg_configs, **local_args, **kwargs) - elif model_type == "diffusion": - from auto_round.compressors_new.diffusion_mixin import DiffusionMixin - - class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor): - """Diffusion model with zero-shot RTN compression""" - - pass - - return DiffusionZeroShotCompressor(alg_configs, **local_args, **kwargs) - else: - return ZeroShotCompressor(alg_configs, **local_args, **kwargs) + return _get_compressor_class(model_type, ZeroShotCompressor)(alg_configs, **local_args, **kwargs) class AutoRoundCompatible: diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index f319d00a5..b8af2f8de 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -1213,4 +1213,8 @@ def immediate_pack(name: str, layer_config: dict): output_dir=_get_save_folder_name(compress_context.formats[0]), layer_config=layer_config, tokenizer=model_context.tokenizer, + mllm=model_context.is_mllm, + processor=getattr(model_context, "processor", None), + image_processor=getattr(model_context, "image_processor", None), + quant_nontext_module=getattr(model_context, "quant_nontext_module", False), ) diff --git a/auto_round/context/compress.py b/auto_round/context/compress.py index 07f378de6..5b92a8b7c 100644 --- a/auto_round/context/compress.py +++ b/auto_round/context/compress.py @@ -63,7 +63,6 @@ def __init__( self.immediate_packing = is_immediate_packing self.is_immediate_packing = is_immediate_packing self.is_immediate_saving = is_immediate_saving - self.formats = formats self.static_kv_dtype = static_kv_dtype self.static_attention_dtype = static_attention_dtype diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index c3a06c2ad..5f6dd8453 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -146,11 +146,7 @@ def pack_gguf_layer( ): """Export the model to gguf format.""" global gguf_model_instance_global - # if output_dir is not None and os.path.exists(output_dir): - # logger.warning_once(f"{output_dir} already exists, this may cause model conflict") if "gguf_model_instance_global" not in globals(): - config = model.config - gguf_model_instance_global = [ create_model_class( output_dir, diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index 
92e0015e0..9666bb286 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -66,7 +66,7 @@ def test_func(self): ) assert autoround.group_size == 32 assert not autoround.sym - gguf_file = os.listdir(self.save_dir)[0] + gguf_file = os.listdir(quantized_model_path)[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") eval_generated_prompt(model, self.tokenizer) @@ -151,7 +151,7 @@ def test_vlm_gguf(self, tiny_qwen_vl_model_path): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir(self.save_dir) + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": @@ -171,7 +171,7 @@ def test_vlm_gguf_wo_quant_nontext_module(self, tiny_qwen_vl_model_path): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir(self.save_dir) + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": diff --git a/test/test_cpu/models/test_conv1d.py b/test/test_cpu/models/test_conv1d.py index 8a30b8207..717178e73 100644 --- a/test/test_cpu/models/test_conv1d.py +++ b/test/test_cpu/models/test_conv1d.py @@ -39,7 +39,7 @@ def test_quant(self, dataloader): ) autoround.quantize() - autoround.save_quantized(self.save_dir) + _, quantized_model_path = autoround.save_quantized(self.save_dir, return_folders=True) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, device_map="cpu", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", trust_remote_code=True) model_infer(model, self.tokenizer) From 755ab4e1f3169cb676412349a4e92bcde3cbf4e6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 7 Apr 2026 14:56:50 +0800 Subject: [PATCH 39/90] sync merge Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 6 ++---- auto_round/compressors_new/calib.py | 2 +- auto_round/compressors_new/zero_shot.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index c607087c7..4802814b2 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -338,12 +338,10 @@ def _scheme_post_init(self) -> None: def _gen_auto_scheme(self) -> dict[str, dict]: """Generate per-layer config via AutoScheme delta-loss selection.""" if self.model_context.is_mllm: - logger.info("AutoScheme is not yet supported for multimodal LLMs.") - sys.exit(-1) + raise NotImplementedError("AutoScheme is not yet supported for multimodal LLMs.") if is_quantized_input_module(self.model_context.model): - logger.info("AutoScheme does not currently support quantized input models (e.g., FP8).") - sys.exit(-1) + raise NotImplementedError("AutoScheme does not currently support quantized input models (e.g., FP8).") all_dtypes = [] all_gguf = True diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index f78b45c7b..68dcb4faa 100644 --- 
a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1120,7 +1120,7 @@ def __init__( def _quantize_via_rtn_blockwise(self) -> None: """Quantize model layers block by block using cached inputs and imatrix.""" - all_blocks = self.quantizer.quant_block_list if self.quantizer.quant_block_list else get_block_names(self.model) + all_blocks = self.quant_block_list or get_block_names(self.model) if not all_blocks: raise ValueError("Could not find any blocks. Check the model or quant_block_list.") diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 49df93bcb..7f8e9cbc6 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -114,7 +114,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if use_blockwise_quantization: # The ram usage is a little higher - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) + all_blocks = self.quant_block_list or get_block_names(self.model) pbar = tqdm(range(sum(len(block) for block in all_blocks))) for block_names in all_blocks: for block_name in block_names: From d661e0b2be5bdef85af88c6da3b6350223b7d7d0 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 7 Apr 2026 16:16:46 +0800 Subject: [PATCH 40/90] fix by comment Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 153 ++++++++++++++++++++ auto_round/compressors_new/entry.py | 36 +++++ auto_round/eval/evaluation.py | 7 +- test/test_cpu/export/test_llmc_format.py | 4 +- test/test_cpu/quantization/test_new_arch.py | 38 ----- test/test_cpu/schemes/test_scheme.py | 8 + 6 files changed, 204 insertions(+), 42 deletions(-) delete mode 100644 test/test_cpu/quantization/test_new_arch.py diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 68dcb4faa..9bb53d1f9 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -653,6 +653,159 @@ def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tens input_others = inputs return input_ids, input_others + def normalize_decoding_layer_inputs_(self, decoding_layer_inputs: list[tuple[tuple[Any, dict[str, Any]]]]) -> None: + """Replay captured decoding-layer calls to populate ``self.inputs``. + + Converts the raw ``(args, kwargs)`` tuples captured by LLM-Compressor's + input hook into the ``self.inputs`` dict format expected by + :meth:`quantize_block`. The logic mirrors the old-arch implementation in + ``compressors/base.py``. + + Args: + decoding_layer_inputs: + A list of entries captured by a forward hook on the decoding layer. + Each element is a tuple whose first item is ``(args, kwargs)``. + """ + first_block_name = self.quant_block_list[0][0] + + class _FakeDecodingLayer(torch.nn.Module): + + def forward(self, *args, **kwargs): + return args, kwargs + + fake_layer = _FakeDecodingLayer() + fake_layer.orig_forward = fake_layer.forward + fake_layer.forward = partial(self._get_block_forward_func(first_block_name), fake_layer) + + self.inputs = {} + self.last_cache_name = None + for step_input in decoding_layer_inputs: + args, kwargs = step_input[0] + fake_layer(*args, **kwargs) + + def quantize_block( + self, + block: torch.nn.Module, + inputs: tuple, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload: bool = True, + ): + """Quantize a single decoded block of the model (public API for LLM-Compressor). 
+ + This method is the new-arch equivalent of the old ``BaseCompressor.quantize_block`` + (see ``compressors/base.py``). It is primarily consumed by LLM-Compressor: + https://github.com/vllm-project/llm-compressor/pull/1994 + + The method normalizes the raw decoding-layer inputs provided by LLM-Compressor, + runs the full infrastructure pipeline (device placement, act-max collection, + reference-output caching) for the given *block*, delegates the pure-algorithm + weight optimization to ``self.quantizer.quantize_block``, then returns the + quantized-block outputs. + + Args: + block: The transformer block (decoder layer) to quantize. + inputs: Raw decoding-layer inputs captured by LLM-Compressor's hook. + Format: list of ``((args, kwargs),)`` tuples as produced by the hook. + q_input: Optional quantized input from the previous block. ``None`` on + the first block. + device: Target device for quantization (e.g. ``"cuda:0"``). + auto_offload: When *True*, use the device-map-aware offloading path; + otherwise move ``block`` directly to ``device``. + + Returns: + tuple: ``(q_outputs, reference_output)`` where *q_outputs* is the + block's output after quantization (or ``None`` when + ``enable_quanted_input`` is ``False``), and *reference_output* is the + full-precision reference output collected before optimization. + """ + assert not self.mllm and not self.diffusion, ( + f"Currently, {self.__class__.__name__} does not support quantize_block " "for MLLM / diffusion models." + ) + + # Ensure post_init has been called (sets up model_context, compress_context, + # quantizer, layer_config, etc.). + if not self._post_init_done: + self.post_init() + + self.normalize_decoding_layer_inputs_(inputs) + block_inputs = self.inputs[self.quant_block_list[0][0]] + input_ids, input_others = self._preprocess_block_inputs(block_inputs, "hidden_states") + + # ── Infrastructure: materialize, dtype convert, device placement ────── + materialize_model_(block) + convert_module_to_hp_if_necessary(block, self.model_context.amp_dtype, device) + + if auto_offload: + if is_auto_device_mapping(self.compress_context.device_map) and len(self.compress_context.device_list) > 1: + from auto_round.utils.device import set_auto_device_map_for_block_with_tuning + + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compress_context.device_map, + input_ids, + self.compress_context.low_gpu_mem_usage, + self.quantizer.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compress_context.device_list) > 1 and auto_offload: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + add_hook_to_module(m, AlignDevicesHook(m.tuning_device, io_same_device=True), True) + + # ── Infrastructure: collect reference output and act_max ────────────── + bs = self.quantizer.batch_size * self.quantizer.infer_bs_coeff + if q_input is None: + hook_handles = self.quantizer._register_act_max_hook(block) + reference_output = self.quantizer._get_block_outputs(block, input_ids, input_others, bs) + for h in hook_handles: + h.remove() + else: + reference_output = self.quantizer._get_block_outputs(block, input_ids, input_others, bs) + hook_handles = self.quantizer._register_act_max_hook(block) + if hook_handles: + self.quantizer._get_block_outputs(block, 
q_input, input_others, bs, save_output=False) + for h in hook_handles: + h.remove() + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compress_context.device_list) + else: + clear_memory(device_list=self.compress_context.device_list) + input_ids = q_input + + # ── Pure algorithm: delegates to quantizer ──────────────────────────── + mid_iter_mem_check = self.compress_context.low_gpu_mem_usage and card_0_in_high_risk + self.quantizer.quantize_block( + block, + input_ids, + input_others, + reference_output, + loss_device=loss_device, + mid_iter_mem_check=mid_iter_mem_check, + ) + + # ── Collect quantized-block outputs ─────────────────────────────────── + if self.quantizer.enable_quanted_input: + q_outputs = self.quantizer._get_block_outputs(block, input_ids, input_others, bs) + else: + q_outputs = None + + # ── Cleanup ─────────────────────────────────────────────────────────── + if len(self.compress_context.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + mv_module_from_gpu(block) + + return q_outputs, reference_output + def _quantize_blocks( self, model: torch.nn.Module, diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index a37a91b1e..81a29661b 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -41,6 +41,37 @@ def _preview_resolved_attrs(config, scheme=None) -> dict: return {} +def _eager_validate_scheme(config, scheme=None) -> None: + """Eagerly validate scheme/config constraints at construction time. + + Mirrors the old-arch ``_check_configs()`` call in ``BaseCompressor.__init__``. + Raises ``ValueError`` or ``NotImplementedError`` immediately if the scheme + contains config-only invalid combinations (e.g. tuple group_size with non-fp8 + weight dtype) so that callers get a fast failure rather than a deferred error + buried inside ``post_init()``. + + ``AutoScheme`` is skipped because it requires model information. + """ + if isinstance(scheme, AutoScheme): + return + + scheme_attr_names = QuantizationScheme.get_attributes() + user_overrides = {k: getattr(config, k) for k in scheme_attr_names if getattr(config, k, None) is not None} + try: + _, _, final_attrs = _parse_scheme(scheme, user_overrides) + except (ValueError, NotImplementedError): + raise + except Exception: + return # Other parse errors are deferred to post_init + + import copy + + temp_config = copy.copy(config) + for key, value in final_attrs.items(): + setattr(temp_config, key, value) + temp_config.check_config() # raises ValueError / NotImplementedError if invalid + + # --------------------------------------------------------------------------- # Compressor-class registry # --------------------------------------------------------------------------- @@ -173,6 +204,11 @@ def __new__( else: quant_config = alg_configs + # Eagerly validate scheme constraints that do not require model info. + # This mirrors old-arch _check_configs() called at __init__ time so that + # callers get ValueError/NotImplementedError on construction, not deferred. 
+    _eager_validate_scheme(quant_config, scheme) + # using different compressor base on AlgConfigs local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index d4c4ec917..f878476f9 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -388,8 +388,11 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s evaluate_diffusion_model(args, autoround=autoround, model=model) return -    # Check if evaluation is needed for language models -    eval_folder = folders[-1] if folders else None +    # Check if evaluation is needed for language models +    if isinstance(folders, list): +        eval_folder = folders[-1] if folders else None +    else: +        eval_folder = folders if args.tasks is None or args.tasks == "" or eval_folder is None: return diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py index c7f3210c2..64f46df14 100644 --- a/test/test_cpu/export/test_llmc_format.py +++ b/test/test_cpu/export/test_llmc_format.py @@ -94,7 +94,7 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, tmp_path): disable_opt_rtn=True, scheme=scheme, ) -    compressed_model, _ = ar.quantize_and_save(output_dir=tmp_path, format="llm_compressor") +    compressed_model, tmp_path = ar.quantize_and_save(output_dir=tmp_path, format="llm_compressor") tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -128,7 +128,7 @@ def test_mixed_precision_llmcompressor_format(self, tiny_opt_model_path, tmp_pat disable_opt_rtn=True, scheme=scheme, ) -    ar.quantize_and_save(output_dir=tmp_path, format="llm_compressor") +    _, tmp_path = ar.quantize_and_save(output_dir=tmp_path, format="llm_compressor") model = AutoModelForCausalLM.from_pretrained(tmp_path, torch_dtype="auto", trust_remote_code=True) op = model.model.decoder.layers[0].fc1 if op.quantization_scheme.targets != ["Linear"]: diff --git a/test/test_cpu/quantization/test_new_arch.py b/test/test_cpu/quantization/test_new_arch.py deleted file mode 100644 index dd24e7e9d..000000000 --- a/test/test_cpu/quantization/test_new_arch.py +++ /dev/null @@ -1,38 +0,0 @@ -import copy -import shutil -import sys - -import pytest -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from auto_round import AutoRound - -from ...helpers import get_model_path - - -class TestAutoRound: -    # def test_calib(self, tiny_opt_model_path): -    #     from auto_round.compressors_new import Compressor -    #     from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -    #     config = SignRoundConfig(scheme="W4A16", iters=200, lr=0.005, bits=2, group_size=32) -    #     compressor = Compressor(config, tiny_opt_model_path, format="auto_round") -    #     compressor.quantize_and_save() - -    def test_opt_rtn(self, tiny_opt_model_path): -        from auto_round.algorithms.quantization.rtn.config import RTNConfig -        from auto_round.compressors_new import Compressor - -        config = RTNConfig(scheme="W4A16", bits=2, group_size=32) -        compressor = Compressor(config, tiny_opt_model_path, format="auto_round") -        compressor.quantize_and_save() - -        ar = AutoRound(tiny_opt_model_path, bits=2, group_size=32, iters=0) -        ar.quantize_and_save() - -    # def test_rtn(self, tiny_opt_model_path): -    #     from auto_round.compressors_new import Compressor -    #     from auto_round.algorithms.quantization.rtn.config import RTNConfig -    #     config = RTNConfig(scheme="W4A16", bits=2, group_size=32,
disable_opt_rtn=True) - # compressor = Compressor(config, tiny_opt_model_path, format="auto_round") - # compressor.quantize_and_save() diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index b249bc50a..41e9a9f7d 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -36,10 +36,12 @@ def test_gguf(self, tiny_qwen_model_path, dataloader): def test_w4a16(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.post_init() assert ar.bits == 4 def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + ar.post_init() assert ar.bits == 2 def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): @@ -88,6 +90,7 @@ def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): def test_mxfp4(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.post_init() assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" @@ -95,6 +98,7 @@ def test_mxfp4(self, tiny_opt_model_path, dataloader): def test_mxfp4_rceil(self, tiny_opt_model_path): ar = AutoRound(tiny_opt_model_path, scheme="MXFP4_RCEIL", nsamples=1, iters=1) + ar.post_init() assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" @@ -107,11 +111,13 @@ def test_vlm(self, tiny_qwen_vl_model_path): from auto_round import AutoRound ar = AutoRound(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + ar.post_init() assert ar.bits == 2 assert ar.act_bits == 16 def test_nvfp4(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.post_init() assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "nv_fp" @@ -199,6 +205,7 @@ def test_set_scheme(self, tiny_qwen_model_path): def test_fp8_static(self, tiny_opt_model_path): ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1) + ar.post_init() assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" @@ -211,6 +218,7 @@ def test_fp8_static(self, tiny_opt_model_path): def test_fp8_static_rtn(self, tiny_opt_model_path): ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0, disable_opt_rtn=True) + ar.post_init() assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" From 08770cf5e7565734cf05304c2b51e3e74b199f45 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 8 Apr 2026 13:16:33 +0800 Subject: [PATCH 41/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 11 +++---- auto_round/compressors_new/base.py | 19 ++++++++++++ auto_round/compressors_new/calib.py | 4 +++ auto_round/compressors_new/mllm_mixin.py | 12 ++++++++ auto_round/compressors_new/utils.py | 3 ++ auto_round/compressors_new/zero_shot.py | 34 ++++++++++++++++++++++ test/test_cpu/models/test_moe_model.py | 14 +++++---- 7 files changed, 87 insertions(+), 10 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 3a0a0da59..737b27eeb 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -323,11 +323,12 @@ def _get_block_outputs( # TODO FIXME # This function could not be compiled, causing a large 
accuracy drop when `enable_alg_ext` is used. # To avoid issues, remove it in all scenarios except WOQ. - _bf = ( - compile_func(block_forward, self.compress_context.device) - if self.compress_context.enable_torch_compile - else block_forward - ) + if self.compress_context.enable_torch_compile: + if not hasattr(self, "_compiled_block_forward"): + self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) + _bf = self._compiled_block_forward + else: + _bf = block_forward output = [] nsamples = len(input_ids) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 4802814b2..21f49fa6f 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -983,6 +983,12 @@ def save_quantized( logger.warning("please run autoround.quantize first") return folders = [] + if self.formats is None: + logger.info("format is not set, using default auto_round format.") + self.formats = "auto_round" + if isinstance(self.formats, str): + self.formats = get_formats(self.formats, self) + self.compress_context.formats = self.formats for format in self.formats: save_folder = _get_save_folder_name(format) if self.act_bits <= 8 and format.is_fake(): @@ -1030,6 +1036,12 @@ def _get_export_dir(self, output_dir: str, format_str: str) -> str: Mirrors the logic previously in ``__main__.py`` so callers only need to pass the base ``output_dir`` and the format string. """ + # Diffusion models use save_quantized from DiffusionMixin which manages its own + # directory layout (model_index.json + per-component subdirs). Appending a + # scheme-derived suffix here would place files one level too deep. + if getattr(self, "diffusion", False): + return output_dir + model_name = (getattr(self.model_context.model, "name_or_path", "") or "").rstrip("/") cfg = self.quantize_config group_size = cfg.group_size @@ -1120,6 +1132,13 @@ def quantize_and_save( # IMPORTANT: post_init() must run outside any @torch.inference_mode() context # because AutoScheme's delta-loss selection requires gradient tracking. self.post_init() + # If post_init() was called manually before quantize_and_save() (e.g. ar.post_init() + # in tests), _resolve_formats saw formats=None and was a no-op. Now that we have set + # self.formats to a default string above, resolve it into OutputFormat objects so that + # quantize() and save_quantized() receive proper objects, not a raw string. + if isinstance(self.formats, str): + self.formats = get_formats(self.formats, self) + self.compress_context.formats = self.formats # Derive descriptive export dir after post_init so scheme-resolved attrs are available. 
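Putting the hunks above together, the calling pattern this supports looks roughly like the sketch below (model path and scheme are placeholders; quantize_and_save() itself runs post_init() when it has not been run yet, so the explicit call is optional and mirrors what the scheme tests do):

    from auto_round import AutoRound

    ar = AutoRound("facebook/opt-125m", scheme="W4A16", nsamples=1, iters=1, seqlen=2)
    ar.post_init()            # optional: scheme attrs such as ar.bits resolve here
    model, folder = ar.quantize_and_save(output_dir="./out", format="auto_round")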
_fmt_str = format or (self.formats if isinstance(self.formats, str) else "") output_dir = self._get_export_dir(output_dir, _fmt_str) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 8ac66a405..92adcf866 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -270,6 +270,10 @@ def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_n """ if layer_names is None: layer_names = [] + + if not self._post_init_done: + self.post_init() + self.inputs = {} self.to_cached_layers = block_names + layer_names diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index 6fc5f480a..9fccf72e5 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -126,6 +126,16 @@ def calib(self, nsamples, bs): if isinstance(self.dataset, str): dataset = self.dataset.replace(" ", "") + # Mirror old arch __init__: switch text-only dataset to MLLM dataset when + # quant_nontext_module=True, as text datasets cannot calibrate vision modules. + from auto_round.calib_dataset import CALIB_DATASETS + + if self.quant_nontext_module and dataset in CALIB_DATASETS: + logger.warning( + "Text only dataset cannot be used for calibrating non-text modules," + " switching to liuhaotian/llava_conv_58k" + ) + dataset = "liuhaotian/llava_conv_58k" ( self.dataloader, self.batch_size, @@ -145,6 +155,8 @@ def calib(self, nsamples, bs): nsamples=nsamples, quant_nontext_module=self.quant_nontext_module, ) + else: + self.dataloader = self.dataset # Process data through the model for calibration total_cnt = 0 diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index f1342af61..249c2bda7 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -324,6 +324,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str elif isinstance(item, QuantizationScheme): config = asdict(item) elif isinstance(item, dict): + # "in_blocks" is an internal bookkeeping key injected by LLM-Compressor; + # silently drop it before validation. + item = {k: v for k, v in item.items() if k != "in_blocks"} invalid = set(item) - set(scheme_keys + ("fixed_by_user", "scale_dtype")) if invalid: raise ValueError( diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 7f8e9cbc6..8845ccb07 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -70,6 +70,40 @@ def __init__( ) self.lr = 5e-3 + def quantize_block( + self, + block: torch.nn.Module, + inputs: tuple, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload: bool = True, + ): + """Quantize a single block via RTN (public API for LLM-Compressor). + + ZeroShotCompressor does not need calibration data, so ``inputs`` and + ``q_input`` are accepted for interface compatibility but not used for + algorithm purposes. The block is materialized, converted to the target + dtype, moved to ``device``, and quantized in-place via RTN. + + Returns: + tuple: ``(None, None)`` — RTN does not produce reference outputs. + """ + assert not self.mllm and not self.diffusion, ( + f"Currently, {self.__class__.__name__} does not support quantize_block " "for MLLM / diffusion models." 
+ ) + + if not self._post_init_done: + self.post_init() + + materialize_model_(block) + convert_module_to_hp_if_necessary(block, self.model_context.amp_dtype, device) + block = block.to(device) + + self.quantizer.quantize_block(block) + + mv_module_from_gpu(block) + return None, None + # Use no_grad instead of inference_mode # https://github.com/intel/auto-round/issues/1620 @torch.no_grad() diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index f597a4f9d..1eeec3a73 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -24,7 +24,7 @@ def quantize_model(model, output_dir, scheme, iters=0, ignore_layers="self_attn, disable_opt_rtn=disable_opt_rtn, ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model + return quantized_model, save_folder def count_modules_by_type(model, target_module_name_or_class): @@ -44,7 +44,9 @@ def count_modules_by_type(model, target_module_name_or_class): def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path): config = AutoConfig.from_pretrained(tiny_gpt_oss_model_path, trust_remote_code=True) output_dir = str(tmp_path / "saved") - quantized_model = quantize_model(tiny_gpt_oss_model_path, output_dir, scheme, ignore_layers="self_attn,lm_head") + quantized_model, save_folder = quantize_model( + tiny_gpt_oss_model_path, output_dir, scheme, ignore_layers="self_attn,lm_head" + ) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -63,7 +65,7 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path): ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." # verify the quantized model can be loaded and run inference - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + loaded_model = GptOssForCausalLM.from_pretrained(save_folder) for n, m in quantized_model.named_modules(): if m.__class__.__name__ == "QuantLinear": loaded_m = loaded_model.get_submodule(n) @@ -78,12 +80,14 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path): def test_llama4(tiny_llama4_model_path): output_dir = "./tmp/test_quantized_llama4" - quantized_model = quantize_model(tiny_llama4_model_path, output_dir, "MXFP4", ignore_layers="self_attn,lm_head") + quantized_model, save_folder = quantize_model( + tiny_llama4_model_path, output_dir, "MXFP4", ignore_layers="self_attn,lm_head" + ) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." 
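The tests here now keep the folder returned by quantize_and_save() because the files may land in a scheme-derived subdirectory of output_dir rather than in output_dir itself, so loading must use the returned path. A minimal sketch (the folder name and model path are illustrative only):

    quantized_model, save_folder = quantize_model(model_path, "./out", "MXFP4")
    # save_folder may be e.g. "./out/<model>-MXFP4" rather than "./out" itself
    loaded = Llama4ForConditionalGeneration.from_pretrained(save_folder)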
- loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir) + loaded_model = Llama4ForConditionalGeneration.from_pretrained(save_folder) for n, m in quantized_model.named_modules(): if m.__class__.__name__ == "QuantLinear": loaded_m = loaded_model.get_submodule(n) From 97b89dda35d9bdc28a92573988f002d7b6659524 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 8 Apr 2026 15:27:07 +0800 Subject: [PATCH 42/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 12 +++++- auto_round/compressors_new/mllm_mixin.py | 54 ++++++++++++++++++++---- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 92adcf866..d4682d2d5 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -538,7 +538,10 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): self.inputs[name][key] = [data] else: data = post_process_cache_data(self.quantizer.batch_size, data, key) - self.inputs[name][key] = list(torch.split(data, 1, dim=self.quantizer.batch_dim)) + if isinstance(data, torch.Tensor): + self.inputs[name][key] = list(torch.split(data, 1, dim=self.quantizer.batch_dim)) + else: + self.inputs[name][key] = [data] else: # append cache inputs new_data = post_process_cache_data(self.quantizer.batch_size, kwargs[key], key) if new_data is None: # shareable args or NoneType @@ -547,7 +550,12 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): if self.quantizer.batch_size <= 1: self.inputs[name][key].append(new_data) else: - self.inputs[name][key].extend(list(torch.split(new_data, 1, dim=self.quantizer.batch_dim))) + if isinstance(new_data, torch.Tensor): + self.inputs[name][key].extend( + list(torch.split(new_data, 1, dim=self.quantizer.batch_dim)) + ) + else: + self.inputs[name][key].append(new_data) elif isinstance(kwargs[key], (str, bool, type(None))): if key not in self.inputs[name].keys(): self.inputs[name][key] = kwargs[key] diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index 9fccf72e5..86cfdbe47 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -15,6 +15,7 @@ import torch from auto_round.logger import logger +from auto_round.utils import to_device class MLLMMixin: @@ -63,6 +64,21 @@ def __init__( # Pass quant_nontext_module to ModelContext so get_block_names can include vision blocks kwargs.setdefault("quant_nontext_module", quant_nontext_module) + # Mirror old arch: reset batch_size to 1 when quantizing non-text modules, + # because vision encoder blocks have non-standard hidden_states shapes that + # break batch_dim detection, and image collation fails with batch_size > 1. + if quant_nontext_module: + batch_size = kwargs.get("batch_size", None) + if batch_size is not None and batch_size != 1: + grad_acc = kwargs.get("gradient_accumulate_steps", 1) + kwargs["gradient_accumulate_steps"] = batch_size * grad_acc + kwargs["batch_size"] = 1 + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps to {batch_size * grad_acc} " + f"because batch_size={batch_size} cannot be used for calibrating non-text modules." + ) + # super().__init__() creates model_context, which eagerly loads the model and # populates model_context.processor / image_processor / tokenizer. 
super().__init__(*args, **kwargs) @@ -164,15 +180,37 @@ def calib(self, nsamples, bs): if data is None: continue - if isinstance(data, dict): - data_new = { - key: value.to(mc.model.device) if isinstance(value, torch.Tensor) else value - for key, value in data.items() - } - else: - data_new = data - try: + if isinstance(data, str): + # List-of-strings dataset: process through template → model inputs + processed = self.template_obj.processor.get_input( + text=data, images=None, max_length=self.seqlen, squeeze=False + ) + data_new = {k: to_device(v, mc.model.device) for k, v in processed.items()} + elif isinstance(data, dict) and "text" in data: + # FakeDataLoader-style {"text": ..., "image": ...}: process through template + text = data["text"] + if isinstance(text, dict): + text = [text] + input_text = self.template_obj._encode(text) + processed = self.template_obj.processor.get_input( + text=input_text, + images=data.get("image", None), + max_length=self.seqlen, + squeeze=False, + ) + data_new = {} + for key, value in processed.items(): + tensor_val = value if isinstance(value, torch.Tensor) else torch.as_tensor(value) + data_new[key] = to_device(tensor_val, mc.model.device) + elif isinstance(data, dict): + data_new = { + key: value.to(mc.model.device) if isinstance(value, torch.Tensor) else value + for key, value in data.items() + } + else: + data_new = data + if isinstance(data_new, dict): mc.model(**data_new) else: From 002525690788f95354869ff8fa98878a2e5bbf7e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 8 Apr 2026 15:52:41 +0800 Subject: [PATCH 43/90] performance Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 737b27eeb..0a167ee54 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -371,9 +371,9 @@ def _get_current_q_output( ) -> torch.Tensor: """Compute block output for a mini-batch selected by *indices* (used during training). - Handles both LLM and diffusion model block formats. Always calls the - plain (non-compiled) ``block_forward`` because this runs inside the - autograd training loop where compilation is not needed. + Handles both LLM and diffusion model block formats. Uses the compiled + block_forward when enable_torch_compile is True (same as _get_block_outputs), + matching old-arch behaviour where self.block_forward was compiled at init. """ current_input_ids, current_input_others = self._sampling_inputs( input_ids, @@ -383,6 +383,14 @@ def _get_current_q_output( batch_dim=self.batch_dim, share_cache_keys=self.model_context.shared_cache_keys, ) + # Mirror _get_block_outputs: use compiled block_forward when available. 
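+        # Caching the compiled callable on `self` matters because this method runs
+        # once per mini-batch inside the training loop (thousands of calls per
+        # block), so re-invoking compile_func on every call would repeat the
+        # wrapping/compilation setup cost each time.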
+ if self.compress_context.enable_torch_compile: + if not hasattr(self, "_compiled_block_forward"): + self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) + _bf = self._compiled_block_forward + else: + _bf = block_forward + if getattr(self.model_context, "is_diffusion", False): output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") @@ -390,7 +398,7 @@ def _get_current_q_output( hidden_states = current_input_ids.pop("hidden_states") current_input_others.update(current_input_ids) current_input_ids = hidden_states - output_q = block_forward( + output_q = _bf( block, current_input_ids, current_input_others, @@ -400,7 +408,7 @@ def _get_current_q_output( idx, ) else: - output_q = block_forward( + output_q = _bf( block, current_input_ids, current_input_others, From 183112693ec2402fe4faf486c6a4b8fc0cc1dba6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 9 Apr 2026 10:21:06 +0800 Subject: [PATCH 44/90] fix Signed-off-by: n1ck-guo --- auto_round/autoround.py | 4 +++- auto_round/compressors_new/calib.py | 11 +++++++++ auto_round/compressors_new/diffusion_mixin.py | 11 +++++++++ auto_round/compressors_new/mllm_mixin.py | 23 +++++++++++++++++-- auto_round/compressors_new/zero_shot.py | 10 ++++++++ auto_round/envs.py | 5 ++-- 6 files changed, 59 insertions(+), 5 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index bba4c4921..b2bec2651 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -17,6 +17,7 @@ import torch +import auto_round.envs as envs from auto_round.compressors import ( AdamCompressor, BaseCompressor, @@ -33,7 +34,8 @@ if TYPE_CHECKING: from auto_round.auto_scheme.gen_auto_scheme import AutoScheme -NEW_ARCH = True +# Default to new architecture; set AR_DISABLE_NEW_ARCH=true/1 to force old architecture. 
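+# e.g.  AR_DISABLE_NEW_ARCH=1 python your_quantize_script.py
+# (accepted truthy values are "1", "true" and "yes"; anything else keeps the new arch)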
+NEW_ARCH = not envs.AR_DISABLE_NEW_ARCH class AutoRound: diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index d4682d2d5..7978dea8d 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -35,6 +35,8 @@ check_skippable_keywords, immediate_pack, init_cache, + is_nv_fp, + is_static_wfp8afp8, reset_params, ) from auto_round.logger import logger @@ -52,6 +54,7 @@ is_quantized_input_module, memory_monitor, mv_module_from_gpu, + set_amax_for_all_moe_layers, set_module, to_device, to_dtype, @@ -805,6 +808,10 @@ def quantize_block( mid_iter_mem_check=mid_iter_mem_check, ) + # ── MoE scale alignment for FP8 dispatch efficiency ──────────────── + if is_nv_fp(self.quantizer.act_data_type) or is_static_wfp8afp8(self.quantizer): + set_amax_for_all_moe_layers(block, attr_name="act_max") + # ── Collect quantized-block outputs ─────────────────────────────────── if self.quantizer.enable_quanted_input: q_outputs = self.quantizer._get_block_outputs(block, input_ids, input_others, bs) @@ -930,6 +937,10 @@ def _quantize_blocks( mid_iter_mem_check=mid_iter_mem_check, ) + # ── MoE scale alignment for FP8 dispatch efficiency ──────────────── + if is_nv_fp(self.quantizer.act_data_type) or is_static_wfp8afp8(self.quantizer): + set_amax_for_all_moe_layers(m, attr_name="act_max") + # ── Infrastructure: collect q_outputs if needed ─────────────────── if self.quantizer.enable_quanted_input: q_input = self.quantizer._get_block_outputs(m, input_ids, input_others, bs) diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 65e2c23f0..10010210a 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -63,6 +63,17 @@ def _get_block_forward_func(self, name: str): """ return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) + def _should_stop_cache_forward(self, name: str) -> bool: + """Diffusion models must run all denoising steps to collect enough inputs. + + Mirrors old-arch DiffusionCompressor._should_stop_cache_forward which always + returns False so the pipeline never exits early after the first block hit. + Without this, CalibCompressor._should_stop_cache_forward would stop after the + first inference step, yielding only nsamples inputs instead of + nsamples * num_inference_steps. + """ + return False + @torch.no_grad() def calib(self, nsamples, bs): """Perform diffusion-specific calibration for quantization. diff --git a/auto_round/compressors_new/mllm_mixin.py b/auto_round/compressors_new/mllm_mixin.py index 86cfdbe47..5248cb28e 100644 --- a/auto_round/compressors_new/mllm_mixin.py +++ b/auto_round/compressors_new/mllm_mixin.py @@ -68,14 +68,33 @@ def __init__( # because vision encoder blocks have non-standard hidden_states shapes that # break batch_dim detection, and image collation fails with batch_size > 1. if quant_nontext_module: + # batch_size may come from kwargs (placed there by AutoRoundCompatible local_args) + # or from the AlgConfig object in args[0] (the authoritative source for quantizer.batch_size). + # We must update both so that quantizer.batch_size is also reset to 1. 
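+            # Worked example (illustrative numbers): batch_size=4 with
+            # gradient_accumulate_steps=2 becomes batch_size=1 with
+            # gradient_accumulate_steps=8, so the effective number of samples per
+            # optimizer step is preserved (4 * 2 == 1 * 8).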
batch_size = kwargs.get("batch_size", None) + _alg_cfg = args[0] if args else None + if batch_size is None and _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "batch_size") and cfg.batch_size is not None: + batch_size = cfg.batch_size + break if batch_size is not None and batch_size != 1: grad_acc = kwargs.get("gradient_accumulate_steps", 1) - kwargs["gradient_accumulate_steps"] = batch_size * grad_acc + new_grad_acc = batch_size * grad_acc + kwargs["gradient_accumulate_steps"] = new_grad_acc kwargs["batch_size"] = 1 + # Also patch the AlgConfig object so that BaseCompressor.quantize_config.batch_size == 1 + if _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "batch_size"): + cfg.batch_size = 1 + if hasattr(cfg, "gradient_accumulate_steps"): + cfg.gradient_accumulate_steps = new_grad_acc logger.warning( f"reset batch_size({batch_size}) to 1 and " - f"gradient_accumulate_steps to {batch_size * grad_acc} " + f"gradient_accumulate_steps to {new_grad_acc} " f"because batch_size={batch_size} cannot be used for calibrating non-text modules." ) diff --git a/auto_round/compressors_new/zero_shot.py b/auto_round/compressors_new/zero_shot.py index 8845ccb07..426719184 100644 --- a/auto_round/compressors_new/zero_shot.py +++ b/auto_round/compressors_new/zero_shot.py @@ -19,6 +19,7 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.compressors_new.base import BaseCompressor +from auto_round.compressors_new.utils import is_nv_fp, is_static_wfp8afp8 from auto_round.logger import logger from auto_round.modeling.fused_moe.replace_modules import materialize_model_ from auto_round.utils import ( @@ -32,6 +33,7 @@ global_state, memory_monitor, mv_module_from_gpu, + set_amax_for_all_moe_layers, set_module, ) @@ -101,6 +103,10 @@ def quantize_block( self.quantizer.quantize_block(block) + # ── MoE scale alignment for FP8 dispatch efficiency ──────────────── + if is_nv_fp(self.quantizer.act_data_type) or is_static_wfp8afp8(self.quantizer): + set_amax_for_all_moe_layers(block, attr_name="act_max") + mv_module_from_gpu(block) return None, None @@ -161,6 +167,10 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # ── Pure algorithm ──────────────────────────────────────── self.quantizer.quantize_block(block) + # ── MoE scale alignment for FP8 dispatch efficiency ──────────────── + if is_nv_fp(self.quantizer.act_data_type) or is_static_wfp8afp8(self.quantizer): + set_amax_for_all_moe_layers(block, attr_name="act_max") + # ── Infrastructure: shard write / device cleanup ────────── if self.is_immediate_saving: # Save non-quantized leaf modules (e.g. norms, embeddings in block). 
diff --git a/auto_round/envs.py b/auto_round/envs.py index 8a731ad10..b97e56a7e 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -34,6 +34,7 @@ "AR_DISABLE_DATASET_SUBPROCESS": lambda: os.getenv("AR_DISABLE_DATASET_SUBPROCESS", "0").lower() in ("1", "true"), "AR_DISABLE_COPY_MTP_WEIGHTS": lambda: os.getenv("AR_DISABLE_COPY_MTP_WEIGHTS", "0").lower() in ("1", "true", "yes"), + "AR_DISABLE_NEW_ARCH": lambda: os.getenv("AR_DISABLE_NEW_ARCH", "0").lower() in ("1", "true", "yes"), } @@ -69,8 +70,8 @@ def set_config(**kwargs): for key, value in kwargs.items(): if key in environment_variables: # Convert value to appropriate string format - if key == "AR_USE_MODELSCOPE": - # Handle boolean values for AR_USE_MODELSCOPE + if key in ("AR_USE_MODELSCOPE", "AR_DISABLE_NEW_ARCH"): + # Handle boolean values for boolean env flags str_value = "true" if value in [True, "True", "true", "1", 1] else "false" else: # For other variables, convert to string From 8873eca80952a9e8f4498bbd131c761d41a5ead2 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 9 Apr 2026 15:32:40 +0800 Subject: [PATCH 45/90] fix Signed-off-by: n1ck-guo --- .../algorithms/quantization/rtn/quantizer.py | 24 +++++++++++++++++-- .../quantization/sign_round/quantizer.py | 8 +++---- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/auto_round/algorithms/quantization/rtn/quantizer.py b/auto_round/algorithms/quantization/rtn/quantizer.py index 555542841..21493e6c9 100644 --- a/auto_round/algorithms/quantization/rtn/quantizer.py +++ b/auto_round/algorithms/quantization/rtn/quantizer.py @@ -81,6 +81,17 @@ def quantize_block( Returns: dict: Empty dict (zero-shot RTN has no tunable parameters to return). """ + if ( + self.config.is_act_nv_fp + or self.config.is_static_afp8 + or (self.config.is_wfp8afp8 and not self.config.act_dynamic) + ): + # For FP8 static / NVFP paths, expert input scales are derived during + # layer quantization from the current act_max. Unify MoE input-proj + # act_max values before quantizing each expert so exported input_scale + # stays aligned across experts. + set_amax_for_all_moe_layers(block, attr_name="act_max") + for _name, m in block.named_modules(): if hasattr(m, "global_name") and check_to_quantized(m): self.quantize_layer(m.global_name) @@ -207,7 +218,9 @@ def __init__(self, config: RTNConfig): def quantize_layer_outside_block(self, *args, **kwargs): return self.quantize_layer(*args, **kwargs) - def quantize_block(self, block: torch.nn.Module, **kwargs): + def quantize_block( + self, block: torch.nn.Module, input_ids=None, input_others=None, reference_output=None, **kwargs + ): """Apply imatrix-informed RTN quantization to a block. Pure-algorithm entry point. All infrastructure (device placement, @@ -217,9 +230,16 @@ def quantize_block(self, block: torch.nn.Module, **kwargs): Args: block: Module already placed on the correct device(s) with act_max attributes populated by the Compressor's hook pass. + input_ids: Unused for optimized RTN; accepted for interface consistency. + input_others: Unused for optimized RTN. + reference_output: Unused for optimized RTN. 
""" update_block_global_scale_if_needed(block, self.data_type, self.group_size) - if self.config.is_act_nv_fp or self.config.is_static_afp8: + if ( + self.config.is_act_nv_fp + or self.config.is_static_afp8 + or (self.config.is_wfp8afp8 and not self.config.act_dynamic) + ): # enable moe experts act_max automatic generation for Linear set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers diff --git a/auto_round/algorithms/quantization/sign_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py index 0f177d970..4aa8befa2 100644 --- a/auto_round/algorithms/quantization/sign_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -270,8 +270,8 @@ def quantize_block( if self.attention_mask: num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + for batch_start in range(0, len(global_indices), batch_size): + indices = global_indices[batch_start : batch_start + batch_size] current_output = self._get_current_output(reference_output, indices) current_output = to_device(current_output, loss_device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) @@ -443,8 +443,8 @@ def quantize_layer_outside_block( if self.attention_mask: num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - for tmp_step in range(gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + for batch_start in range(0, len(global_indices), batch_size): + indices = global_indices[batch_start : batch_start + batch_size] if q_inputs is not None: current_input = [q_inputs[i] for i in indices] current_input = torch.cat(current_input, dim=0).to(device) From a1a42447cfb268e4ffc636603db1a225e9d597e6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 9 Apr 2026 16:14:40 +0800 Subject: [PATCH 46/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 0a167ee54..02edf8dee 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -74,6 +74,11 @@ def __init__(self, config: QuantizationConfig): self.batch_size = getattr(config, "batch_size", 8) self.batch_dim = getattr(config, "batch_dim", None) self.infer_bs_coeff = getattr(config, "infer_bs_coeff", 1) + # Whether to feed quantized-block outputs as inputs to the next block. + # Subclasses that support cascaded quantized-input (e.g. SignRoundQuantizer) + # override this from their config. Defaults to False for zero-shot algorithms + # (RTN) where activations are not used during weight optimization. 
+ self.enable_quanted_input = getattr(config, "enable_quanted_input", False) @classmethod def from_config(cls, config: QuantizationConfig): From bd75536172c746ff1535d8f57ac0fa5814ba0cd6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Apr 2026 13:48:58 +0800 Subject: [PATCH 47/90] preformance Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 38 +++++++++++++--------- auto_round/compressors_new/calib.py | 1 + 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 02edf8dee..60bd1476a 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -325,15 +325,7 @@ def _get_block_outputs( ): _bf = block_forward else: - # TODO FIXME - # This function could not be compiled, causing a large accuracy drop when `enable_alg_ext` is used. - # To avoid issues, remove it in all scenarios except WOQ. - if self.compress_context.enable_torch_compile: - if not hasattr(self, "_compiled_block_forward"): - self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) - _bf = self._compiled_block_forward - else: - _bf = block_forward + _bf = self._resolve_block_forward() output = [] nsamples = len(input_ids) @@ -365,6 +357,26 @@ def _get_block_outputs( return output + def _resolve_block_forward(self): + """Resolve and cache the block forward function once. + + This avoids repeated attribute checks in the hot training loop + (called thousands of times per block). + """ + if hasattr(self, "_resolved_block_forward"): + return self._resolved_block_forward + if self.compress_context.enable_torch_compile: + if not hasattr(self, "_compiled_block_forward"): + self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) + self._resolved_block_forward = self._compiled_block_forward + else: + self._resolved_block_forward = block_forward + return self._resolved_block_forward + + def _invalidate_block_forward_cache(self): + """Clear the cached block forward function (call when block changes).""" + self.__dict__.pop("_resolved_block_forward", None) + def _get_current_q_output( self, block: torch.nn.Module, @@ -388,13 +400,7 @@ def _get_current_q_output( batch_dim=self.batch_dim, share_cache_keys=self.model_context.shared_cache_keys, ) - # Mirror _get_block_outputs: use compiled block_forward when available. - if self.compress_context.enable_torch_compile: - if not hasattr(self, "_compiled_block_forward"): - self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) - _bf = self._compiled_block_forward - else: - _bf = block_forward + _bf = self._resolve_block_forward() if getattr(self.model_context, "is_diffusion", False): output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 7978dea8d..f73c913c1 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -953,6 +953,7 @@ def _quantize_blocks( mv_module_from_gpu(m) if self.enable_torch_compile: torch._dynamo.reset() + self.quantizer._invalidate_block_forward_cache() # Always advance input_ids to the current block's output so that the next # block receives the correct activations. 
When enable_quanted_input is # False we reuse reference_output (unquantized block output); otherwise From e4ce4206b5e1ef64f95a22b47cd1aa546582a279 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Apr 2026 15:27:51 +0800 Subject: [PATCH 48/90] sync Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 13 ++++++++----- auto_round/compressors_new/utils.py | 17 ++++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 60bd1476a..1233c9deb 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -363,12 +363,15 @@ def _resolve_block_forward(self): This avoids repeated attribute checks in the hot training loop (called thousands of times per block). """ - if hasattr(self, "_resolved_block_forward"): - return self._resolved_block_forward + cached = self.__dict__.get("_resolved_block_forward") + if cached is not None: + return cached if self.compress_context.enable_torch_compile: - if not hasattr(self, "_compiled_block_forward"): - self._compiled_block_forward = compile_func(block_forward, self.compress_context.device) - self._resolved_block_forward = self._compiled_block_forward + compiled = self.__dict__.get("_compiled_block_forward") + if compiled is None: + compiled = compile_func(block_forward, self.compress_context.device) + self._compiled_block_forward = compiled + self._resolved_block_forward = compiled else: self._resolved_block_forward = block_forward return self._resolved_block_forward diff --git a/auto_round/compressors_new/utils.py b/auto_round/compressors_new/utils.py index 249c2bda7..f7fce3bd9 100644 --- a/auto_round/compressors_new/utils.py +++ b/auto_round/compressors_new/utils.py @@ -347,9 +347,11 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str extra_scheme_keys = ("scale_dtype",) scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) layer_config = copy.deepcopy(layer_config) or {} + ignore_layer_patterns = set() if ignore_layers: ignore_layers = ignore_layers.replace(" ", "").split(",") ignore_layers = [name + "." if name[-1].isdigit() else name for name in ignore_layers] + ignore_layer_patterns = set(ignore_layers) # 1. ignore_layers -> force 16 for name in get_fp_layer_names(model, ignore_layers): @@ -434,11 +436,16 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if name in all_module_names: m = get_module(model, name) if len(list(m.children())) == 0 and type(m) not in supported_types: - layer_config.pop(name) - logger.warning( - f"'{name}' exists in the model but is not a supported quantization target " - f"in the current scheme, ignoring its setting in `layer_config`" - ) + val = layer_config.pop(name) + if name in ignore_layer_patterns: + # Keep unsupported ignore_layers entries so export can serialize + # them into regex-based extra_config for loaders like vLLM INC. 
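+                        # e.g. ignore_layers="lm_head,model.embed_tokens" where
+                        # embed_tokens is an nn.Embedding (not a supported target):
+                        # the entry is preserved here so the exported config still
+                        # marks it as ignored for downstream loaders.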
+ regex_config[name] = val + else: + logger.warning( + f"'{name}' exists in the model but is not a supported quantization target " + f"in the current scheme, ignoring its setting in `layer_config`" + ) continue regex = re.compile(to_standard_regex(name)) From 1286749cf1f9d1e7e0c62528dd9b71d79fd299d8 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Apr 2026 16:06:06 +0800 Subject: [PATCH 49/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/sign_round/quantizer.py | 2 +- auto_round/compressors_new/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/algorithms/quantization/sign_round/quantizer.py b/auto_round/algorithms/quantization/sign_round/quantizer.py index 4aa8befa2..69572b13c 100644 --- a/auto_round/algorithms/quantization/sign_round/quantizer.py +++ b/auto_round/algorithms/quantization/sign_round/quantizer.py @@ -127,7 +127,7 @@ def _get_loss( ): autocast_ctx = ( nullcontext() - if not self.model_context.amp + if self.model_context.amp else autocast(device_type=str(device).split(":")[0], dtype=self.model_context.amp_dtype) ) if self.attention_mask: diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 21f49fa6f..85e0f4af4 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -895,7 +895,7 @@ def _adjust_immediate_packing_and_saving(self): logger.warning("reset low_cpu_mem_usage to False due to tied weights") return if len(tied_weight_keys) == 1: - key = tied_weight_keys.keys[0] + key = list(tied_weight_keys.keys())[0] if "lm_head" not in key: self.is_immediate_saving = False if self.compress_context.low_cpu_mem_usage: From 5c212b56efe86c476008438a9ddd5c3c81acb1a7 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 10 Apr 2026 16:46:35 +0800 Subject: [PATCH 50/90] performance Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index f73c913c1..ef1bc58f9 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -1023,7 +1023,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: supported_types=SUPPORTED_LAYER_TYPES, quant_block_list=self.quantizer.quant_block_list, ) - start_time = time.time() all_first_block_names = [block[0] for block in all_blocks] if len(layer_names) > 0: logger.info( @@ -1064,6 +1063,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar + start_time = time.time() for block_names in all_blocks: inputs = all_inputs[block_names[0]] all_inputs.pop(block_names[0]) From 4806d5aa5487ef9f46b6f847013ae1e98408acdb Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 12 Apr 2026 07:48:23 +0800 Subject: [PATCH 51/90] performance Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 24 ++++++++++++-------- auto_round/algorithms/quantization/config.py | 2 +- auto_round/compressors_new/calib.py | 6 +++++ auto_round/utils/device.py | 10 ++++++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 1233c9deb..e6744a96a 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -318,14 +318,7 @@ def _get_block_outputs( self.compress_context.cache_device, ) - if ( - 
(self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp)) # have hooks - or self.enable_alg_ext # Use imatrix - # or not self.disable_opt_rtn # Use imatrix - ): - _bf = block_forward - else: - _bf = self._resolve_block_forward() + _bf = self._resolve_block_forward() output = [] nsamples = len(input_ids) @@ -362,11 +355,24 @@ def _resolve_block_forward(self): This avoids repeated attribute checks in the hot training loop (called thousands of times per block). + + For activation-quantization schemes (e.g. FP8_STATIC) or when + algorithm extensions are enabled, forward hooks are attached to layers + inside the block. ``torch.compile`` is incompatible with these hooks, + so we must fall back to the plain ``block_forward``. This mirrors the + old-arch behaviour where ``self.block_forward`` was set in ``__init__`` + to the uncompiled function for these cases. """ cached = self.__dict__.get("_resolved_block_forward") if cached is not None: return cached - if self.compress_context.enable_torch_compile: + # Act-quantization hooks / alg-extension hooks are incompatible with + # torch.compile → always use the plain (uncompiled) block_forward. + if ( + self.config.is_act_quantize and (not self.config.act_dynamic or self.config.is_act_nv_fp) + ) or self.enable_alg_ext: + self._resolved_block_forward = block_forward + elif self.compress_context.enable_torch_compile: compiled = self.__dict__.get("_compiled_block_forward") if compiled is None: compiled = compile_func(block_forward, self.compress_context.device) diff --git a/auto_round/algorithms/quantization/config.py b/auto_round/algorithms/quantization/config.py index 4ce703396..d99219e51 100644 --- a/auto_round/algorithms/quantization/config.py +++ b/auto_round/algorithms/quantization/config.py @@ -30,7 +30,7 @@ class BackendDataType(str, Enum): @dataclass(kw_only=True) -class QuantizationConfig(AlgConfig): +class QuantizationConfig: _alg_cls: ClassVar[str] = None # quantization args diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index ef1bc58f9..964cf39f9 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import gc import time import traceback from functools import partial @@ -61,6 +62,7 @@ wrap_block_forward_positional_to_kwargs, ) from auto_round.utils.device import ( + _maybe_trim_malloc, parse_available_devices, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock @@ -1006,6 +1008,10 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """ self.post_init() + # Reclaim heap fragmentation from init/post_init before the memory-intensive quantize loop. 
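+        # (_maybe_trim_malloc is a throttled wrapper around glibc's malloc_trim(0);
+        #  on non-glibc platforms it is effectively a no-op.)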
+ gc.collect() + _maybe_trim_malloc() + self._check_compatibility() if bool(self.quantizer.quant_block_list): diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 747a4eb2b..c6fba10bb 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -398,6 +398,7 @@ def __enter__(self): # Create and inject fake triton module class FakeTriton: + def __getattr__(self, name): return None @@ -595,6 +596,7 @@ def _maybe_trim_malloc() -> None: class ClearMemory: + def __init__(self, device_list: list | tuple | None = None): self.device_list = device_list @@ -606,6 +608,13 @@ def __call__( from auto_round.utils.device import is_hpex_available if is_hpex_available(): + # Clear CPU-side references so Python can reclaim them. + if isinstance(tensor, list): + for i in range(len(tensor)): + tensor[i] = None + tensor = None + gc.collect() + _maybe_trim_malloc() memory_monitor.update_hpu(device_list) return else: @@ -1727,6 +1736,7 @@ def dump_mem_usage(msg: str = "", log_level: str = "info"): """Decorator to dump memory usage before and after a function call.""" def decorator(func): + @functools.wraps(func) def wrapper(*args, **kwargs): memory_monitor.update_cpu() From 1f1fbd93e83e5150e18ba9e4c2e3c314c10da89e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 12 Apr 2026 08:20:17 +0800 Subject: [PATCH 52/90] fix Signed-off-by: n1ck-guo --- test/test_cpu/models/test_omni_model.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/test_cpu/models/test_omni_model.py b/test/test_cpu/models/test_omni_model.py index 136f19b31..f02ea29e6 100644 --- a/test/test_cpu/models/test_omni_model.py +++ b/test/test_cpu/models/test_omni_model.py @@ -278,18 +278,20 @@ def test_weight_fidelity(self): intermediate = 32 # moe_intermediate_size # Verify thinker expert weights + # Use equal_nan=True because fused expert parameters may contain + # uninitialized memory with NaN bit patterns, and NaN != NaN in IEEE 754. 
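+        # Quick illustration: torch.allclose(torch.tensor([float("nan")]),
+        # torch.tensor([float("nan")])) is False, but it is True with equal_nan=True.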
for i in range(4): expert = model.thinker.model.layers[0].mlp.experts[i] - assert torch.allclose(expert.gate_proj.weight.data, thinker_gate_up[i, :intermediate, :]) - assert torch.allclose(expert.up_proj.weight.data, thinker_gate_up[i, intermediate:, :]) - assert torch.allclose(expert.down_proj.weight.data, thinker_down[i]) + assert torch.allclose(expert.gate_proj.weight.data, thinker_gate_up[i, :intermediate, :], equal_nan=True) + assert torch.allclose(expert.up_proj.weight.data, thinker_gate_up[i, intermediate:, :], equal_nan=True) + assert torch.allclose(expert.down_proj.weight.data, thinker_down[i], equal_nan=True) # Verify talker expert weights for i in range(4): expert = model.talker.model.layers[0].mlp.experts[i] - assert torch.allclose(expert.gate_proj.weight.data, talker_gate_up[i, :intermediate, :]) - assert torch.allclose(expert.up_proj.weight.data, talker_gate_up[i, intermediate:, :]) - assert torch.allclose(expert.down_proj.weight.data, talker_down[i]) + assert torch.allclose(expert.gate_proj.weight.data, talker_gate_up[i, :intermediate, :], equal_nan=True) + assert torch.allclose(expert.up_proj.weight.data, talker_gate_up[i, intermediate:, :], equal_nan=True) + assert torch.allclose(expert.down_proj.weight.data, talker_down[i], equal_nan=True) def test_forward_output_match(self): """Test that replaced MoE forward output matches original.""" From e4fdfe6d98398d7bfd5a5e783a305801f5aa8830 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 12 Apr 2026 09:56:29 +0800 Subject: [PATCH 53/90] update Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 964cf39f9..b36f15d1d 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -964,6 +964,8 @@ def _quantize_blocks( clear_memory( input_ids if input_ids is not next_input_ids else None, device_list=self.compress_context.device_list ) + if reference_output is not next_input_ids: + clear_memory(reference_output, device_list=self.compress_context.device_list) memory_monitor.log_summary() # ── Infrastructure: immediate_pack / shard write ────────────────── From ec45a1c55e279fcbda4e10e5650e7e4733793083 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 12 Apr 2026 13:38:34 +0800 Subject: [PATCH 54/90] fix: skip compile_func for FP8_STATIC on HPU + trim malloc at ModelContext init - _hardware_setup: apply act-quantize/alg-ext guard before compile_func, matching _resolve_block_forward() and old-arch behavior. On HPU where enable_torch_compile stays True for FP8_STATIC, this avoids creating a compiled graph that wastes ~264 MB of HPU memory. - ModelContext.__init__: gc.collect + malloc_trim after model/tokenizer loading to reclaim C heap fragmentation (~96 MB). Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 11 +++++++++-- auto_round/context/model.py | 7 +++++++ test/test_cpu/models/test_omni_model.py | 1 - 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 85e0f4af4..274bd7b77 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -822,9 +822,16 @@ def _hardware_setup(self) -> None: # Re-evaluate torch.compile eligibility now that data_type is resolved. 
self._adjust_torch_compile(self.enable_torch_compile) self.compress_context.enable_torch_compile = self.enable_torch_compile - self.block_forward = ( - compile_func(block_forward, self.compress_context.device) if self.enable_torch_compile else block_forward + # Apply the same act-quantization / alg-ext guard as + # _resolve_block_forward() so we never compile when hooks are present. + cfg = self.quantize_config + _needs_plain_forward = (cfg.is_act_quantize and (not cfg.act_dynamic or cfg.is_act_nv_fp)) or getattr( + cfg, "enable_alg_ext", False ) + if self.enable_torch_compile and not _needs_plain_forward: + self.block_forward = compile_func(block_forward, self.compress_context.device) + else: + self.block_forward = block_forward if self.compress_context.low_cpu_mem_usage: self._offloader.reset() diff --git a/auto_round/context/model.py b/auto_round/context/model.py index 603b5a7de..212582083 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import importlib from typing import Any, Callable, Optional, Union @@ -37,6 +38,7 @@ mllm_load_model, unsupported_meta_device, ) +from auto_round.utils.device import _maybe_trim_malloc __all__ = ["ModelContext"] @@ -119,6 +121,11 @@ def __init__( else: logger.info(f"using {self.model.dtype} for quantization tuning") + # Reclaim C heap fragmentation left by model/tokenizer loading so + # that the quantize loop starts from a tighter RSS baseline. + gc.collect() + _maybe_trim_malloc() + def _load_model(self): if is_mllm_model(self.model, platform=self.platform): self.is_mllm = True diff --git a/test/test_cpu/models/test_omni_model.py b/test/test_cpu/models/test_omni_model.py index f02ea29e6..f965da9b2 100644 --- a/test/test_cpu/models/test_omni_model.py +++ b/test/test_cpu/models/test_omni_model.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """Unit tests for Qwen2.5-Omni and Qwen3-Omni-MoE model support. 
Tests cover: From 3cd3c739f851953d086f3a9545e8a546b4263364 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 13 Apr 2026 15:34:18 +0800 Subject: [PATCH 55/90] fix(memory): reduce peak RSS for new arch via forced malloc_trim and init reorder - Add _force_trim_malloc() in device.py that unconditionally calls malloc_trim(0), bypassing the counter-based throttle in _maybe_trim_malloc() which was skipping critical lifecycle trim points - ClearMemory HPU path: replace _maybe_trim_malloc() with _force_trim_malloc() so heap pages are reclaimed before each MemoryMonitor RSS sample, preventing inflated peak_ram readings - ModelContext._load_model: add gc.collect + _force_trim_malloc before llm_load_model to reclaim temporary HTTP/config objects from is_mllm_model/is_diffusion_model/AutoConfig.from_pretrained calls - ModelContext.__init__: use _force_trim_malloc at end so the trim actually fires (previously _maybe_trim_malloc was a no-op at counter=1) - BaseCompressor.__init__: reorder context creation so ModelContext (large model allocation) is created before CompressContext (small), matching OLD arch allocation order to reduce heap fragmentation - BaseCompressor.post_init: add gc.collect + _force_trim_malloc after the five init phases to start quantize loop from tighter baseline - CalibCompressor.quantize: use _force_trim_malloc at loop start --- auto_round/compressors_new/base.py | 41 +++++++++++++++++++---------- auto_round/compressors_new/calib.py | 4 +-- auto_round/context/model.py | 11 ++++++-- auto_round/utils/device.py | 21 ++++++++++++++- 4 files changed, 58 insertions(+), 19 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 274bd7b77..50ac8e441 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +import gc import os import sys from dataclasses import asdict, dataclass, fields @@ -56,7 +57,7 @@ is_quantized_input_module, memory_monitor, ) -from auto_round.utils.device import set_non_auto_device_map +from auto_round.utils.device import _force_trim_malloc, get_major_device, set_non_auto_device_map from auto_round.utils.offload import OffloadManager from auto_round.wrapper import wrapper_block @@ -228,18 +229,13 @@ def __init__( # consecutive AutoRound creations don't inherit stale config from earlier ones. CompressContext.reset_context() ModelContext.reset_context() - # Alternatively, you can use CompressContext.create_context - self.compress_context = CompressContext( - low_cpu_mem_usage, - low_gpu_mem_usage, - device_map, - enable_torch_compile, - is_immediate_packing=self.is_immediate_packing, - is_immediate_saving=self.is_immediate_saving, - formats=self.formats, - static_kv_dtype=self.static_kv_dtype, - static_attention_dtype=self.static_attention_dtype, - ) + + # Resolve the device eagerly so ModelContext can be created before + # CompressContext. Creating ModelContext first places the large model + # allocation early in the heap, matching the OLD arch allocation order + # and reducing C-heap fragmentation (which is amplified on HPU). 
+ _device = get_major_device(device_map if device_map is not None else 0) + self.model_context = ModelContext( model, tokenizer=tokenizer, @@ -248,11 +244,23 @@ def __init__( trust_remote_code=trust_remote_code, amp=amp, need_calib=self.need_calib, - device=self.compress_context.device, + device=_device, formats=self.formats, is_act_quantize=self.quantize_config.is_act_quantize, quant_nontext_module=quant_nontext_module, ) + # Alternatively, you can use CompressContext.create_context + self.compress_context = CompressContext( + low_cpu_mem_usage, + low_gpu_mem_usage, + device_map, + enable_torch_compile, + is_immediate_packing=self.is_immediate_packing, + is_immediate_saving=self.is_immediate_saving, + formats=self.formats, + static_kv_dtype=self.static_kv_dtype, + static_attention_dtype=self.static_attention_dtype, + ) self.shard_writer = None # scale_dtype is resolved in quantizer.resolve_scheme() after scheme resolution, @@ -558,6 +566,11 @@ def post_init(self) -> None: self._build_layer_config() self._hardware_setup() + # Reclaim heap fragmentation from the five init phases above so that + # the quantize loop starts from a tighter RSS baseline. + gc.collect() + _force_trim_malloc() + self._post_init_done = True # ── Pipeline phase methods ──────────────────────────────────────────────── diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index b36f15d1d..44e1e00db 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -62,7 +62,7 @@ wrap_block_forward_positional_to_kwargs, ) from auto_round.utils.device import ( - _maybe_trim_malloc, + _force_trim_malloc, parse_available_devices, ) from auto_round.wrapper import WrapperLinear, WrapperMultiblock @@ -1012,7 +1012,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # Reclaim heap fragmentation from init/post_init before the memory-intensive quantize loop. gc.collect() - _maybe_trim_malloc() + _force_trim_malloc() self._check_compatibility() diff --git a/auto_round/context/model.py b/auto_round/context/model.py index 212582083..4f992758e 100644 --- a/auto_round/context/model.py +++ b/auto_round/context/model.py @@ -38,7 +38,7 @@ mllm_load_model, unsupported_meta_device, ) -from auto_round.utils.device import _maybe_trim_malloc +from auto_round.utils.device import _force_trim_malloc __all__ = ["ModelContext"] @@ -124,7 +124,7 @@ def __init__( # Reclaim C heap fragmentation left by model/tokenizer loading so # that the quantize loop starts from a tighter RSS baseline. gc.collect() - _maybe_trim_malloc() + _force_trim_malloc() def _load_model(self): if is_mllm_model(self.model, platform=self.platform): @@ -171,6 +171,13 @@ def _load_model(self): "Please consider submitting an issue to https://github.com/intel/auto-round/issues" ) + # Reclaim temporary HTTP/config objects from model type detection + # and AutoConfig loading before the large model allocation. This + # reduces heap fragmentation especially on HPU where habana internal + # allocations amplify fragmentation into persistent RSS growth. 
+ gc.collect() + _force_trim_malloc() + self.model, self.tokenizer = llm_load_model( self.model, platform=self.platform, diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index c6fba10bb..1394e58b6 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -564,6 +564,25 @@ def _clear_memory_for_cpu_and_cuda( _malloc_trim_counter = 0 +def _force_trim_malloc() -> None: + """Unconditionally release glibc heap pages back to the OS on Linux. + + Unlike :func:`_maybe_trim_malloc`, this ignores the call-count throttle and + always invokes ``malloc_trim(0)``. Use at critical lifecycle boundaries + (end of model loading, end of post_init, start of quantize loop) where a + one-time trim has a meaningful impact on peak RSS. + """ + if os.name != "posix": + return + if os.environ.get("AR_ENABLE_MALLOC_TRIM", "1") != "1": + return + try: + libc = ctypes.CDLL("libc.so.6") + libc.malloc_trim(0) + except Exception: + pass + + def _maybe_trim_malloc() -> None: """Optionally release glibc heap pages back to OS on Linux. @@ -614,7 +633,7 @@ def __call__( tensor[i] = None tensor = None gc.collect() - _maybe_trim_malloc() + _force_trim_malloc() memory_monitor.update_hpu(device_list) return else: From 14a59db159cf6909172166ada6d5b2b895da0d66 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 14 Apr 2026 12:54:47 +0800 Subject: [PATCH 56/90] fix(memory): reduce peak RAM via deferred ShardWriter, intermediate GC, and dataloader cleanup - Defer ShardWriter creation from post_init to save_quantized (or _adjust_immediate_packing for immediate-save flows) to avoid heap fragmentation from parameter iteration during initialization - Add gc.collect + _force_trim_malloc between Phase 4 (layer config) and Phase 5 (hardware setup) to compact heap before compile setup - Release calibration dataloader after cache_inter_data completes to free tokenized sample tensors earlier --- auto_round/compressors_new/base.py | 28 +++++++++++++++++++++++++--- auto_round/compressors_new/calib.py | 3 +++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 9b923d975..03c1ef4ba 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -564,10 +564,17 @@ def post_init(self) -> None: self._resolve_formats() self._patch_model() self._build_layer_config() + + # Reclaim temporaries from Phases 1-4 (scheme resolution, format + # parsing, model patching, layer-config walk) before Phase 5 + # allocates hardware/compile objects. This compacts the heap so that + # the fragmentation gap between live and freed blocks is minimised. + gc.collect() + _force_trim_malloc() + self._hardware_setup() - # Reclaim heap fragmentation from the five init phases above so that - # the quantize loop starts from a tighter RSS baseline. + # Final trim after all init phases. gc.collect() _force_trim_malloc() @@ -655,7 +662,8 @@ def _resolve_formats(self) -> None: if self.formats is not None: self.compress_context.formats = self.formats ShardWriter.reset() - self.shard_writer = ShardWriter(self.model_context.model, bits=8) + # Defer ShardWriter construction to _ensure_shard_writer() to avoid + # heap fragmentation during post_init (parameter iteration). # Snapshot the user-specified layer_config before GGUF processing may # add extra entries, so we can distinguish them later in step 2b. 
@@ -961,6 +969,17 @@ def _adjust_immediate_packing_and_saving(self): self.compress_context.is_immediate_packing = self.is_immediate_packing self.compress_context.is_immediate_saving = self.is_immediate_saving + # Create ShardWriter eagerly only when immediate saving is active + # (it interleaves with the quantize loop). Otherwise keep it deferred + # until save_quantized() to avoid heap fragmentation during init. + if self.is_immediate_saving: + self._ensure_shard_writer() + + def _ensure_shard_writer(self): + """Lazily create ShardWriter if it hasn't been created yet.""" + if self.shard_writer is None and self.formats is not None: + self.shard_writer = ShardWriter(self.model_context.model, bits=8) + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. Returns: @@ -1180,6 +1199,9 @@ def quantize_and_save( self.quantize() self.model_context.quantized = True + # Ensure ShardWriter is ready before saving (deferred from post_init). + self._ensure_shard_writer() + # Save the quantized model in the specified format_list model, folders = self.save_quantized(output_dir, inplace=inplace, return_folders=True, **kwargs) memory_monitor.log_summary() diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 897362dd9..a5bbc1f1c 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -329,6 +329,9 @@ def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_n for attr in ("last_cache_name", "_cache_target_set", "_cache_seen_targets", "to_cached_layers"): if hasattr(self, attr): delattr(self, attr) + # Release calibration dataloader to free tokenized sample tensors + if hasattr(self, "dataloader"): + del self.dataloader res = self.inputs if tmp_dtype is not None: self.model_context.model = self.model_context.model.to(tmp_dtype) From 29969c88b5379f33088cc58b27564710530bb209 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 15 Apr 2026 12:22:15 +0800 Subject: [PATCH 57/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/entry.py | 1 + auto_round/export/utils.py | 4 +++- auto_round/special_model_handler.py | 7 ++++--- test/test_cuda/models/test_omni_model.py | 8 ++++---- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 81a29661b..c55769da2 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -483,6 +483,7 @@ def __new__( format=format, scheme=scheme, dataset=dataset, + iters=iters, low_gpu_mem_usage=low_gpu_mem_usage, device_map=device_map, enable_torch_compile=enable_torch_compile, diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 4a76fce09..c740791ef 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -371,13 +371,15 @@ def filter_quantization_config(quantization_config): quantization_config.pop("act_sym", None) quantization_config.pop("act_group_size", None) - clean_list = ("supported_types", "quant_block_list") + clean_list = ("supported_types", "quant_block_list", "transform_configs") for key in list(quantization_config.keys()): if callable(key): quantization_config.pop(key) elif isinstance(quantization_config[key], (list, tuple)): if any([callable(item) for item in quantization_config[key]]): quantization_config.pop(key) + elif len(quantization_config[key]) == 0: + quantization_config.pop(key) if key 
in clean_list and key in quantization_config: quantization_config.pop(key) return quantization_config diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index b6343e048..5b5e66ba7 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -60,15 +60,16 @@ def _handle_special_model(model): - if hasattr(model, "config") and model.config.model_type == "deepseek_vl_v2": + model_type = getattr(getattr(model, "config", None), "model_type", None) + if model_type == "deepseek_vl_v2": from functools import partial model.forward = partial(_deepseek_vl2_forward, model) - if hasattr(model, "config") and model.config.model_type == "qwen2_5_omni": + if model_type == "qwen2_5_omni": from functools import partial model.forward = partial(_qwen2_5_omni_forward, model) - if hasattr(model, "config") and model.config.model_type == "qwen3_omni_moe": + if model_type == "qwen3_omni_moe": from functools import partial model.forward = partial(_qwen3_omni_moe_forward, model) diff --git a/test/test_cuda/models/test_omni_model.py b/test/test_cuda/models/test_omni_model.py index b4ccdb81c..d74a4e529 100644 --- a/test/test_cuda/models/test_omni_model.py +++ b/test/test_cuda/models/test_omni_model.py @@ -73,10 +73,10 @@ def test_quantize_and_reload(self, tiny_qwen2_5_omni_model_path, tmp_path): for extra_file in ["spk_dict.pt"]: src = os.path.join(tiny_qwen2_5_omni_model_path, extra_file) if os.path.exists(src): - shutil.copy2(src, tmp_path) + shutil.copy2(src, save_folder) # Reload - loaded_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(tmp_path, device_map="cuda") + loaded_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(save_folder, device_map="cuda") # Run inference on thinker inp = torch.randint(0, 100, (1, 64)).to("cuda") @@ -111,7 +111,7 @@ def test_quantize_and_reload(self, tiny_qwen3_omni_moe_model_path): assert quantized_model is not None, "Quantized model should not be None" # Reload - loaded_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(self.save_dir, device_map="cuda") + loaded_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(save_folder, device_map="cuda") # Run inference on thinker inp = torch.randint(0, 100, (1, 64)).to("cuda") @@ -134,7 +134,7 @@ def test_quantize_mxfp4(self, tiny_qwen3_omni_moe_model_path): assert quantized_model is not None, "MXFP4 quantized model should not be None" # Reload and inference - loaded_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(self.save_dir, device_map="cuda") + loaded_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(save_folder, device_map="cuda") inp = torch.randint(0, 100, (1, 64)).to("cuda") with torch.inference_mode(): From c7f21a7504916260360a45a836406cf013718bff Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 16 Apr 2026 14:08:06 +0800 Subject: [PATCH 58/90] update Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 03c1ef4ba..a017f474f 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -561,6 +561,19 @@ def post_init(self) -> None: return self._resolve_scheme() + + # After scheme resolution, is_act_quantize is known. When activation + # quantization is enabled and the model is in float16, convert to + # bfloat16 to match the old arch. 
This also detaches any parameter + # tensors that are still backed by safetensors' mmap, preventing + # per-block RSS growth (~14 MB/block) when .to(device) page-faults + # the underlying file pages into physical memory. + if self.quantize_config.is_act_quantize and self.model_context.amp_dtype == torch.float16: + logger.warning("force to use bf16 for quantization tuning when enabling activation quantization") + self.model_context.amp_dtype = torch.bfloat16 + if self.model_context.model.dtype != torch.bfloat16: + self.model_context.model = self.model_context.model.to(torch.bfloat16) + self._resolve_formats() self._patch_model() self._build_layer_config() From 028bb0697b45567a7a7c122b5c6fc74050cef4d7 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 16 Apr 2026 14:50:23 +0800 Subject: [PATCH 59/90] sync hadamard transform changes from main branch to new architecture - Replace need_calibration with data_type parameter throughout transform pipeline - Add data_type-aware block_size defaults (mx_fp->32, nv_fp->16) - Disable triton kernel path for NV_FP data types - Expand ROTATION_SUPPORTED_SCHEMES to include MXFP8, MXFP4, NVFP4 - Simplify patch functions: delegate to original _qdq_weight/_qdq_act - Use QModuleBase instead of MXQuantLinearBase for target type detection - Add orig_dtype preservation in input transform hooks - Remove check_supported_schemes from compressor entry point - Remove precision param from weight transform build (keep for input transform) --- auto_round/algorithms/transforms/__init__.py | 8 +- auto_round/algorithms/transforms/base.py | 9 +- .../algorithms/transforms/hadamard/apply.py | 91 ++++++++----------- .../algorithms/transforms/hadamard/config.py | 53 ++++++++++- .../algorithms/transforms/hadamard/patch.py | 78 +++++----------- auto_round/compressors_new/base.py | 6 +- 6 files changed, 120 insertions(+), 125 deletions(-) diff --git a/auto_round/algorithms/transforms/__init__.py b/auto_round/algorithms/transforms/__init__.py index 247a49f99..1a561ad08 100644 --- a/auto_round/algorithms/transforms/__init__.py +++ b/auto_round/algorithms/transforms/__init__.py @@ -114,7 +114,7 @@ def normalize_rotation_config( def apply_rotation( model: torch.nn.Module, config: Any, - need_calibration: bool = False, + data_type: str = "mx_fp", **kwargs: Any, ) -> torch.nn.Module: """Apply a rotation/transform algorithm to *model*. @@ -130,9 +130,7 @@ def apply_rotation( * :class:`HadamardConfig` or compatible ``dict``/``str``. * Any :class:`BaseRotationConfig` subclass. - need_calibration: Forward to the rotation implementation; controls - whether transforms are fused eagerly or patched into - calibration wrappers. + data_type: Quantization data type (e.g. ``"mx_fp"``). **kwargs: Forwarded to :meth:`BaseRotation.apply_to_model`. 
Returns: @@ -146,4 +144,4 @@ def apply_rotation( return model rotation = BaseRotation.from_config(normalised) - return rotation.apply_to_model(model, need_calibration=need_calibration, **kwargs) + return rotation.apply_to_model(model, data_type=data_type, **kwargs) diff --git a/auto_round/algorithms/transforms/base.py b/auto_round/algorithms/transforms/base.py index aeec1454a..236e64347 100644 --- a/auto_round/algorithms/transforms/base.py +++ b/auto_round/algorithms/transforms/base.py @@ -74,17 +74,14 @@ def __init__(self, config: BaseRotationConfig) -> None: def apply_to_model( self, model: torch.nn.Module, - need_calibration: bool = False, + data_type: str = "mx_fp", **kwargs: Any, ) -> torch.nn.Module: """Apply this rotation to *model* and return the (possibly mutated) model. Args: model: The model to transform. - need_calibration: When ``True``, monkey-patch training-time wrappers - (``WrapperLinear``, ``WrapperWALayer``) so the transform is - re-applied each forward pass during calibration. When - ``False``, fuse the transform eagerly into the weight tensor. + data_type: Quantization data type (e.g. ``"mx_fp"``). **kwargs: Algorithm-specific extra arguments. Returns: @@ -134,7 +131,7 @@ def from_config(cls, config: BaseRotationConfig) -> "BaseRotation": # --------------------------------------------------------------------------- #: Quantization schemes that support (and require) rotation transforms. -ROTATION_SUPPORTED_SCHEMES: list[str] = ["MXFP4"] +ROTATION_SUPPORTED_SCHEMES: list[str] = ["MXFP8", "MXFP4", "NVFP4"] def check_supported_schemes(scheme: str) -> None: diff --git a/auto_round/algorithms/transforms/hadamard/apply.py b/auto_round/algorithms/transforms/hadamard/apply.py index 5f99aea09..622cbe203 100644 --- a/auto_round/algorithms/transforms/hadamard/apply.py +++ b/auto_round/algorithms/transforms/hadamard/apply.py @@ -29,13 +29,16 @@ from auto_round.algorithms.transforms.base import BaseRotation from auto_round.algorithms.transforms.hadamard.config import HadamardConfig, normalize_hadamard_config from auto_round.algorithms.transforms.hadamard.transforms import build_hadamard_transform -from auto_round.experimental.qmodules.mx import MXQuantLinearBase # optional dep, guarded below +from auto_round.compressors.utils import is_nv_fp +from auto_round.experimental.qmodules.base import QModuleBase __all__ = ["HadamardRotation", "apply_hadamard_transform"] -# Detect optional Triton path once at import time. -def _triton_available() -> bool: +def _triton_available(data_type: str = "mx_fp") -> bool: + """Best-effort check for whether Triton kernel path can be used.""" + if is_nv_fp(data_type): + return False try: import triton # noqa: F401 # pylint: disable=E0401 @@ -81,25 +84,21 @@ def from_config(cls, config: dict | HadamardConfig) -> "HadamardRotation": def apply_to_model( self, model: torch.nn.Module, - need_calibration: bool = False, location: str = "weight", use_tqdm: bool = True, desc: str | None = None, + data_type: str = "mx_fp", **kwargs: Any, ) -> torch.nn.Module: """Apply the Hadamard rotation to *model*. Args: model: Target model; modified in-place. - need_calibration: When ``True``, calibration wrappers - (:class:`~auto_round.wrapper.WrapperLinear`, - :class:`~auto_round.wrapper.WrapperWALayer`) are - monkey-patched so the transform is re-applied each - forward pass during AutoRound tuning. location: ``"weight"`` (eager, fused into weights) or ``"input"`` (activation-side, via forward hook). use_tqdm: Show a progress bar while iterating modules. 
desc: Custom progress-bar description. + data_type: Quantization data type (e.g. ``"mx_fp"``). **kwargs: Reserved for future use. Returns: @@ -109,10 +108,7 @@ def apply_to_model( cfg = self.config # Collect target modules. - try: - target_types = (torch.nn.Linear, MXQuantLinearBase) - except Exception: - target_types = (torch.nn.Linear,) + target_types = (torch.nn.Linear, QModuleBase) modules = [(name, module) for name, module in model.named_modules() if isinstance(module, target_types)] @@ -120,7 +116,7 @@ def apply_to_model( for name, module in tqdm.tqdm(modules, desc=_desc, disable=not use_tqdm): if "lm_head" in name: continue - _apply_to_module(model, module, cfg, need_calibration, location) + _apply_to_module(model, module, cfg, location, data_type) # Store config on model for serialisation / downstream inspection. setattr(model, "hadamard_config", cfg) @@ -136,27 +132,21 @@ def _apply_to_module( model: torch.nn.Module, module: torch.nn.Module, config: HadamardConfig, - need_calibration: bool, location: str, + data_type: str = "mx_fp", ) -> None: """Apply the configured Hadamard transform to a single *module*.""" - from auto_round.algorithms.transforms.hadamard.patch import ( - patch_quantlinear, - patch_wrapperlinear_to_apply_transform, - patch_wrapperwalayer_forward_to_apply_transform, - ) - if location == "input": - _apply_input_transform(module, config) + _apply_input_transform(module, config, data_type) elif location == "weight": - _apply_weight_transform(module, config, need_calibration) + _apply_weight_transform(module, config) else: raise NotImplementedError(f"Unsupported transform location: {location!r}") -def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig) -> None: +def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig, data_type: str = "mx_fp") -> None: """Register a forward pre-hook that applies the Hadamard to the input activation.""" from auto_round.algorithms.transforms.hadamard.utils.matrix import multihead_matmul @@ -173,16 +163,17 @@ def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig) -> N else: hadamard_weight = None - if _triton_available(): + if _triton_available(data_type): from auto_round.algorithms.transforms.hadamard.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper def _input_hook(self, args): x = args[0] orig_shape = x.shape + orig_dtype = x.dtype x_flat = x.contiguous().flatten(end_dim=-2) - w = hadamard_weight if hadamard_weight is not None else self.hadamard_matrix.T + w = hadamard_weight.to(orig_dtype) if hadamard_weight is not None else self.hadamard_matrix.T.to(orig_dtype) qdq_input, _ = mxfp4_forward_kernel_wrapper(x_flat, w) - return qdq_input.reshape(orig_shape) + return qdq_input.reshape(orig_shape).to(orig_dtype) module.pre_dequantized_input = True module.register_forward_pre_hook(_input_hook, prepend=True) @@ -191,12 +182,13 @@ def _input_hook(self, args): def _input_hook(self, args): x = args[0] ori_shape = x.shape + orig_dtype = x.dtype if hadamard_weight is not None: x = x.view(-1, hadamard_weight.shape[0]) - return multihead_matmul(x, hadamard_weight.to(x.device)).view(ori_shape) + return multihead_matmul(x, hadamard_weight.to(x.device).to(orig_dtype)).view(ori_shape).to(orig_dtype) else: x = x.view(-1, self.hadamard_matrix.shape[0]) - return multihead_matmul(x, self.hadamard_matrix.T).view(ori_shape) + return multihead_matmul(x, self.hadamard_matrix.T.to(orig_dtype)).view(ori_shape).to(orig_dtype) module.pre_dequantized_input = False 
module.register_forward_pre_hook(_input_hook, prepend=True) @@ -205,7 +197,6 @@ def _input_hook(self, args): def _apply_weight_transform( module: torch.nn.Module, config: HadamardConfig, - need_calibration: bool, ) -> None: """Fuse or patch the Hadamard rotation into the weight of *module*.""" from auto_round.algorithms.transforms.hadamard.patch import ( @@ -220,28 +211,26 @@ def _apply_weight_transform( **config.model_dump(), location="weight", device=module.weight.device, - precision=module.weight.dtype, ) # For random Hadamard, save the matrix as a submodule for serialisation. if config.hadamard_type == "random_hadamard": - module.register_module(config.hadamard_type, w_transform) - patch_quantlinear(config.hadamard_type) - - if need_calibration: - inp_transform = build_hadamard_transform( - **config.model_dump(), - location="input", - inverse=True, - device=module.weight.device, - precision=module.weight.dtype, - ) - patch_wrapperlinear_to_apply_transform(w_transform, inp_transform) - patch_wrapperwalayer_forward_to_apply_transform(inp_transform) - else: - # Eagerly fuse the transform into the weight tensor. - with torch.no_grad(): - module.weight.copy_(w_transform(module.weight).to(module.weight.device)) + from auto_round.algorithms.transforms.hadamard.patch import patch_quantlinear as _patch_ql + + _patch_ql(w_transform) + + # Patch WrapperLinear and WrapperWALayer so the transform is applied + # during calibration tuning. + inp_transform = build_hadamard_transform( + **config.model_dump(), + location="input", + inverse=True, + device=module.weight.device, + precision=module.weight.dtype, + ) + + patch_wrapperlinear_to_apply_transform(w_transform, inp_transform) + patch_wrapperwalayer_forward_to_apply_transform(inp_transform) # --------------------------------------------------------------------------- @@ -252,10 +241,10 @@ def _apply_weight_transform( def apply_hadamard_transform( model: torch.nn.Module, config: str | dict | HadamardConfig | None, - need_calibration: bool = False, location: str = "weight", use_tqdm: bool = True, desc: str | None = None, + data_type: str = "mx_fp", ) -> torch.nn.Module: """Apply a Hadamard rotation to *model*. @@ -266,10 +255,10 @@ def apply_hadamard_transform( model: Target model. config: One of: :class:`HadamardConfig`, ``dict``, ``str`` shorthand, or ``None`` (no-op). - need_calibration: See :meth:`HadamardRotation.apply_to_model`. location: ``"weight"`` or ``"input"``. use_tqdm: Show progress bar. desc: Custom progress-bar label. + data_type: Quantization data type (e.g. ``"mx_fp"``). Returns: The transformed model. 
@@ -280,8 +269,8 @@ def apply_hadamard_transform( rotation = HadamardRotation.from_config(normalised) return rotation.apply_to_model( model, - need_calibration=need_calibration, location=location, use_tqdm=use_tqdm, desc=desc, + data_type=data_type, ) diff --git a/auto_round/algorithms/transforms/hadamard/config.py b/auto_round/algorithms/transforms/hadamard/config.py index ebff618c8..df85d8103 100644 --- a/auto_round/algorithms/transforms/hadamard/config.py +++ b/auto_round/algorithms/transforms/hadamard/config.py @@ -20,6 +20,8 @@ from pydantic import BaseModel, Field, field_validator from auto_round.algorithms.transforms.base import BaseRotationConfig +from auto_round.compressors.utils import is_mx_fp, is_nv_fp +from auto_round.utils import logger __all__ = ["HadamardConfig", "normalize_hadamard_config"] @@ -63,6 +65,7 @@ def _validate_hadamard_type(cls, v: str) -> str: def normalize_hadamard_config( config: str | dict | HadamardConfig | None, + data_type: str = "mx_fp", ) -> dict[str, Any]: """Normalise various input forms to a canonical ``dict`` for :class:`HadamardConfig`. @@ -74,6 +77,8 @@ def normalize_hadamard_config( * :class:`HadamardConfig` → converted to ``dict`` * ``str`` shorthand → treated as ``hadamard_type`` (``"default"`` → default :class:`HadamardConfig`) + data_type: Quantization data type. Used to infer ``block_size`` + when not explicitly set (mx_fp → 32, nv_fp → 16). Returns: A validated ``dict`` that can be passed to ``HadamardConfig(**result)``. @@ -82,15 +87,48 @@ def normalize_hadamard_config( ValueError: If the config is invalid. TypeError: If the config type is not recognised. """ + + def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set: bool) -> dict[str, Any]: + block_size = cfg_dict.get("block_size") + + if not block_size_explicitly_set or block_size is None: + if is_mx_fp(data_type): + cfg_dict["block_size"] = 32 + elif is_nv_fp(data_type): + cfg_dict["block_size"] = 16 + logger.warning("block_size is not set for data_type 'nv_fp'; defaulting to 16.") + else: + logger.warning( + f"block_size is not set and cannot be inferred for data_type {data_type!r}; " + "please set block_size explicitly in hadamard_config if needed." 
+ ) + else: + if is_mx_fp(data_type) and block_size != 32: + logger.warning(f"data_type is 'mx_fp' but block_size={block_size}; recommended value is 32.") + elif is_nv_fp(data_type) and block_size != 16: + logger.warning(f"data_type is 'nv_fp' but block_size={block_size}; recommended value is 16.") + + return cfg_dict + if config is None: return {} if isinstance(config, HadamardConfig): - return config.model_dump() + raw_cfg_dict = config.model_dump(exclude_unset=True) + block_size_explicitly_set = "block_size" in raw_cfg_dict + cfg_dict = dict(raw_cfg_dict) + cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) + try: + return HadamardConfig.model_validate(cfg_dict).model_dump() + except Exception as exc: + raise ValueError(f"Invalid HadamardConfig: {exc}") from exc if isinstance(config, dict): + block_size_explicitly_set = "block_size" in config + cfg_dict = dict(config) + cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) try: - return HadamardConfig.model_validate(config).model_dump() + return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as exc: raise ValueError(f"Invalid HadamardConfig dict: {exc}") from exc @@ -99,14 +137,21 @@ def normalize_hadamard_config( if not key: return {} if key == "default": - return HadamardConfig().model_dump() + cfg_dict = {} + cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) + try: + return HadamardConfig.model_validate(cfg_dict).model_dump() + except Exception as exc: + raise ValueError(f"Invalid default hadamard_config after data_type adjustment: {exc}") from exc if key not in HADAMARD_TYPES: raise ValueError( f"Unrecognised hadamard config string: {key!r}. " f"Expected one of {sorted(HADAMARD_TYPES)} or 'default'." ) + cfg_dict = {"hadamard_type": key} + cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) try: - return HadamardConfig.model_validate({"hadamard_type": key}).model_dump() + return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as exc: raise ValueError(f"Failed to build HadamardConfig from {key!r}: {exc}") from exc diff --git a/auto_round/algorithms/transforms/hadamard/patch.py b/auto_round/algorithms/transforms/hadamard/patch.py index e088e0c17..49c838ac6 100644 --- a/auto_round/algorithms/transforms/hadamard/patch.py +++ b/auto_round/algorithms/transforms/hadamard/patch.py @@ -59,64 +59,32 @@ def patch_wrapperlinear_to_apply_transform( def _qdq_weight_patched(self, value, min_scale, max_scale): if self.orig_layer.bits >= 16: - # Keep original behaviour for ≥16-bit quantisation. + # Keep original behaviour for >=16-bit quantisation. 
return _orig_qdq_weight(self, value, min_scale, max_scale) - min_scale.data.clamp_(0, 1.0) - max_scale.data.clamp_(0, 1.0) + if getattr(self, "applied_weight_hadamard", None) is None: + with torch.no_grad(): + weight = self.orig_layer.weight + if weight.device.type == "meta": + weight = self.orig_layer.get_weight().to(self.device) - weight = self.orig_layer.weight - if weight.device.type == "meta": - weight = self.orig_layer.get_weight().to(self.device) + is_conv1d = type(self.orig_layer) is transformers.pytorch_utils.Conv1D + if is_conv1d: + weight = weight.t().contiguous() + new_weight = w_transform(weight).to(self.device) + if is_conv1d: + new_weight = new_weight.t().contiguous() + self.orig_layer.weight.data.copy_(new_weight) + self.applied_weight_hadamard = True - is_conv1d = type(self.orig_layer) is transformers.pytorch_utils.Conv1D - if is_conv1d: - weight = weight.t() - weight = weight.to(self.device) + return _orig_qdq_weight(self, value, min_scale, max_scale) - weight_t = w_transform(weight) - - quant_kwargs = {} - if hasattr(self.orig_layer, "super_bits"): - quant_kwargs["super_bits"] = self.orig_layer.super_bits - quant_kwargs["super_group_size"] = self.orig_layer.super_group_size - - weight_q, scale, zp = self.weight_quant_func( - weight_t, - bits=self.orig_layer.bits, - group_size=self.orig_layer.group_size, - v=value, - min_scale=min_scale, - max_scale=max_scale, - scale_dtype=self.orig_layer.scale_dtype, - tensor_min=self.weight_min, - tensor_max=self.weight_max, - data_type=self.data_type, - q_scale_thresh=self.q_scale_thresh, - imatrix=self.orig_layer.imatrix.to(self.device) if hasattr(self.orig_layer, "imatrix") else None, - global_scale=getattr(self, "weight_global_scale", None), - **quant_kwargs, - ) - weight_q = weight_q.to(dtype=weight.dtype) - if is_conv1d: - weight_q = weight_q.t() - return weight_q, scale, zp + _orig_qdq_act = WrapperLinear._qdq_act def _qdq_act_patched(self, x, act_max_scale, act_max=None): x = inp_transform(x) - act_max_scale.data.clamp_(0, 1.0) - x, scale, zp = self.act_quant_func( - x, - bits=self.orig_layer.act_bits, - group_size=self.orig_layer.act_group_size, - scale_dtype=self.orig_layer.scale_dtype, - q_scale_thresh=self.q_scale_thresh, - data_type=self.act_data_type, - max_scale=act_max_scale, - tensor_max=act_max, - global_scale=getattr(self, "input_global_scale", None), - ) - return x, scale, zp + + return _orig_qdq_act(self, x, act_max_scale, act_max) WrapperLinear._qdq_weight = _qdq_weight_patched WrapperLinear._qdq_act = _qdq_act_patched @@ -155,7 +123,7 @@ def _forward_patched(self, x): WrapperWALayer._hadamard_forward_patched = True -def patch_quantlinear(hadamard_type: str) -> None: +def patch_quantlinear(w_transform) -> None: """Patch :class:`QuantLinear` so random Hadamard matrices are saved when packing. Only needed for ``random_hadamard`` where the rotation matrix must be @@ -219,11 +187,11 @@ def _pack_patched( if global_scale is not None: self.weight_global_scale = global_scale.to(torch.float32).to(device) if input_global_scale is not None: - self.input_global_scale = input_global_scale.to(torch.float32).to(device) + self.input_global_scale = input_global_scale.to(torch.float32).to(device).reshape([1]) - # Save the random Hadamard matrix from the submodule. 
- if hasattr(linear, hadamard_type): - self.register_module(hadamard_type, getattr(linear, hadamard_type)) + # add transform weight + self.register_buffer("hadamard_matrix", w_transform.weight.to(device)) + return QuantLinear.pack = _pack_patched QuantLinear._pack_patched = True diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index a017f474f..5e1f7cd6d 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -26,7 +26,6 @@ from auto_round.algorithms.transforms import ( BaseRotationConfig, apply_rotation, - check_supported_schemes, ) from auto_round.compressors_new.shard_writer import ShardWriter from auto_round.compressors_new.utils import _get_save_folder_name, block_forward, set_layer_config @@ -760,13 +759,12 @@ def _resolve_formats(self) -> None: # ── 2d: apply rotation transforms ──────────────────────────────────── if self.transform_configs: - check_supported_schemes(self.scheme) - need_calibration = self.quantize_config.iters > 0 + logger.info("Applying Hadamard transform to the model.") for rotation_cfg in self.transform_configs: self.model_context.model = apply_rotation( self.model_context.model, rotation_cfg, - need_calibration=need_calibration, + data_type=self.quantize_config.data_type, ) def _patch_model(self) -> None: From 451008b5cebfe1d76940b66c69b41eeff3fa86b8 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 16 Apr 2026 16:13:10 +0800 Subject: [PATCH 60/90] fix sglang test: switch OPT to Qwen3-0.6B to avoid fused qkv_proj regression sglang's OPT model implementation fuses q/k/v into a single qkv_proj layer and expects the quantized checkpoint to have qkv_proj.qweight. But auto-round saves separate q_proj/k_proj/v_proj following HuggingFace's OPT structure, causing a KeyError during weight loading. This regression in sglang@main affects all auto-round users of OPT models. Switch the test to Qwen3-0.6B which uses consistent weight naming between HuggingFace and sglang. Also update test_mixed_ar_format_sglang layer_config: replace OPT-specific layer name 'fc1' with Qwen3's 'mlp' and update corresponding assertions. 
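For reference, a minimal sketch of the failure mode described above; the key names are illustrative placeholders rather than sglang's actual parameter paths. A loader that expects a fused qkv_proj entry cannot find it in a checkpoint that stores the three projections separately:

```python
# Illustrative sketch only: the key names below are hypothetical placeholders,
# not the real sglang / auto-round checkpoint layout.
checkpoint = {
    "layers.0.self_attn.q_proj.qweight": object(),
    "layers.0.self_attn.k_proj.qweight": object(),
    "layers.0.self_attn.v_proj.qweight": object(),
}

def load_fused_qkv(ckpt):
    # A loader built around a fused projection looks up a single key
    # and raises KeyError when the export kept q/k/v separate.
    return ckpt["layers.0.self_attn.qkv_proj.qweight"]

try:
    load_fused_qkv(checkpoint)
except KeyError as missing:
    print(f"KeyError during weight loading: {missing}")
```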
--- test/test_cuda/integrations/test_sglang.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_cuda/integrations/test_sglang.py b/test/test_cuda/integrations/test_sglang.py index 94d5e8c62..8db11b6c7 100644 --- a/test/test_cuda/integrations/test_sglang.py +++ b/test/test_cuda/integrations/test_sglang.py @@ -10,11 +10,11 @@ from auto_round import AutoRound -from ...helpers import get_model_path, opt_name_or_path +from ...helpers import get_model_path, qwen_name_or_path class TestAutoRound: - model_name = opt_name_or_path + model_name = qwen_name_or_path @pytest.fixture(autouse=True) def _save_dir(self, tmp_path): @@ -73,7 +73,7 @@ def test_mixed_ar_format_sglang(self, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, - "fc1": {"bits": 16, "act_bits": 16}, + "mlp": {"bits": 16, "act_bits": 16}, } autoround = AutoRound( @@ -96,9 +96,9 @@ def test_mixed_ar_format_sglang(self, dataloader): quant_config = config.get("quantization_config", {}) extra_config = quant_config.get("extra_config", {}) # check extra_config only saved attributes differing from Scheme values - assert "act_bits" not in extra_config[".*fc1.*"].keys() - assert "group_size" not in extra_config[".*fc1.*"].keys() - assert "bits" in extra_config[".*fc1.*"].keys() and extra_config[".*fc1.*"]["bits"] == 16 + assert "act_bits" not in extra_config[".*mlp.*"].keys() + assert "group_size" not in extra_config[".*mlp.*"].keys() + assert "bits" in extra_config[".*mlp.*"].keys() and extra_config[".*mlp.*"]["bits"] == 16 assert "bits" in extra_config[".*self_attn.*"].keys() and extra_config[".*self_attn.*"]["bits"] == 8 generated_text = self._run_sglang_inference(quantized_model_path) print(generated_text) From 8bdf054e48515f0042ce675ece5c33d14d5b749c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 10:18:32 +0800 Subject: [PATCH 61/90] fix: invalidate compiled block forward cache on block change; guard low_cpu_mem_usage without immediate_packing --- auto_round/algorithms/quantization/base.py | 1 + auto_round/compressors/base.py | 8 ++++++++ auto_round/compressors_new/base.py | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index e6744a96a..29fd9211e 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -385,6 +385,7 @@ def _resolve_block_forward(self): def _invalidate_block_forward_cache(self): """Clear the cached block forward function (call when block changes).""" self.__dict__.pop("_resolved_block_forward", None) + self.__dict__.pop("_compiled_block_forward", None) def _get_current_q_output( self, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index cdd9b743e..751fa5aeb 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1724,6 +1724,14 @@ def _adjust_immediate_packing_and_saving(self): if self.low_cpu_mem_usage and self.is_immediate_packing: self.is_immediate_saving = True + if self.low_cpu_mem_usage and not self.is_immediate_packing: + logger.info( + "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. " + "Setting `low_cpu_mem_usage` to False." 
+ ) + self.low_cpu_mem_usage = False + self.is_immediate_saving = False + if self.low_cpu_mem_usage and self.is_immediate_packing: if formats[0].is_gguf(): logger.warning( diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 5e1f7cd6d..bb5e28ae5 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -944,6 +944,14 @@ def _adjust_immediate_packing_and_saving(self): if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing: self.is_immediate_saving = True + if self.compress_context.low_cpu_mem_usage and not self.is_immediate_packing: + logger.info( + "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. " + "Setting `low_cpu_mem_usage` to False." + ) + self.compress_context.low_cpu_mem_usage = False + self.is_immediate_saving = False + if self.compress_context.low_cpu_mem_usage and self.is_immediate_packing: if formats[0].is_gguf(): logger.warning( From d067b5348ca3152c4de896fd3dc8fadf8b4cf177 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 10:26:26 +0800 Subject: [PATCH 62/90] sync 8d7bb84c to new arch: enable immediate_saving for nv_fp/mx_fp, fix shard_writer storage ptr --- auto_round/compressors_new/base.py | 9 +++++++-- auto_round/compressors_new/shard_writer.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index bb5e28ae5..00bb8319a 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -27,6 +27,7 @@ BaseRotationConfig, apply_rotation, ) +from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.compressors_new.shard_writer import ShardWriter from auto_round.compressors_new.utils import _get_save_folder_name, block_forward, set_layer_config from auto_round.context.compress import CompressContext @@ -978,8 +979,12 @@ def _adjust_immediate_packing_and_saving(self): self.compress_context.low_cpu_mem_usage = False self.is_immediate_saving = False - if self.is_immediate_saving and "int" not in self.quantize_config.data_type: - logger.warning("immediate_saving is only supported for int quantization, set to False") + if self.is_immediate_saving and not ( + "int" in self.quantize_config.data_type + or is_nv_fp(self.quantize_config.data_type) + or is_mx_fp(self.quantize_config.data_type) + ): + logger.warning("immediate_saving is only supported for int/nv_fp/mx_fp quantization, set to False") self.is_immediate_saving = False if self.output_dir is None: diff --git a/auto_round/compressors_new/shard_writer.py b/auto_round/compressors_new/shard_writer.py index 980059a2e..dbdd2cc86 100644 --- a/auto_round/compressors_new/shard_writer.py +++ b/auto_round/compressors_new/shard_writer.py @@ -194,7 +194,7 @@ def _handle_tied_weights(self): filtered_tensors[name] = tensor continue - ptr = tensor.untyped_storage().data_ptr() + ptr = tensor.untyped_storage().data_ptr() + tensor.storage_offset() * tensor.element_size() if ptr not in storage_map: storage_map.add(ptr) filtered_tensors[name] = tensor From 8d2f3419331fd56fb11824f65e88d8fc14a1f05e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 13:30:58 +0800 Subject: [PATCH 63/90] merge main Signed-off-by: n1ck-guo --- auto_round/compressors_new/calib.py | 13 +++++-------- test/test_cpu/export/test_llmc_format.py | 1 + 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index 
f5648ce9a..c78c95e54 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -997,16 +997,14 @@ def _quantize_blocks( if self.enable_torch_compile: torch._dynamo.reset() self.quantizer._invalidate_block_forward_cache() - # Always advance input_ids to the current block's output so that the next - # block receives the correct activations. When enable_quanted_input is - # False we reuse reference_output (unquantized block output); otherwise - # q_input already holds the quantized-block output. - next_input_ids = q_input if q_input is not None else reference_output + # Keep old-arch semantics: the next block's FP reference input comes + # from the current block's reference output, while q_input (when + # enabled) is only used as the quantized-input companion for the + # next block. + next_input_ids = reference_output clear_memory( input_ids if input_ids is not next_input_ids else None, device_list=self.compress_context.device_list ) - if reference_output is not next_input_ids: - clear_memory(reference_output, device_list=self.compress_context.device_list) memory_monitor.log_summary() # ── Infrastructure: immediate_pack / shard write ────────────────── @@ -1092,7 +1090,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: all_inputs = copy.deepcopy(self.inputs) clear_memory(self.inputs, device_list=self.compress_context.device_list) all_q_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - self.inputs = all_q_inputs # Remove accelerate dispatch hooks before moving parameters. # hf_device_map is kept for reference but hooks are no longer needed. if hasattr(self.model_context.model, "hf_device_map") and len(self.model_context.model.hf_device_map) > 1: diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py index fd8c62930..e321648b0 100644 --- a/test/test_cpu/export/test_llmc_format.py +++ b/test/test_cpu/export/test_llmc_format.py @@ -12,6 +12,7 @@ class TestLLMC: + @classmethod def setup_class(self): self.model_name = get_model_path("stas/tiny-random-llama-2") From cec095808b0b0dbc309e4706dc25873f3f01a9cc Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 14:15:35 +0800 Subject: [PATCH 64/90] fix ut Signed-off-by: n1ck-guo --- test/test_cpu/advanced/test_low_precision_input_model.py | 4 ++-- test/test_cpu/models/test_moe_model.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_cpu/advanced/test_low_precision_input_model.py b/test/test_cpu/advanced/test_low_precision_input_model.py index 9b0779954..9087e0de0 100644 --- a/test/test_cpu/advanced/test_low_precision_input_model.py +++ b/test/test_cpu/advanced/test_low_precision_input_model.py @@ -97,6 +97,6 @@ def test_w4a16_to_mxfp4(self, tmp_path): iters=2, nsamples=2, ) - ar.quantize_and_save(tmp_path, format="llm_compressor") - model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path) + _, quantized_model_path = ar.quantize_and_save(tmp_path, format="llm_compressor") + model = transformers.AutoModelForCausalLM.from_pretrained(quantized_model_path) assert model, "Failed to load the quantized model" diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index 484bcbd06..9c25b1d81 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -65,7 +65,7 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path): ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} 
QuantLinear modules, found {quant_linear_cnt}." # verify the quantized model can be loaded and run inference - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + loaded_model = GptOssForCausalLM.from_pretrained(save_folder) inp = torch.randint(0, 100, (1, 32)) with torch.inference_mode(): @@ -81,7 +81,7 @@ def test_llama4(tiny_llama4_model_path): # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." - loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir) + loaded_model = Llama4ForConditionalGeneration.from_pretrained(save_folder) inp = torch.randint(0, 100, (1, 32)) with torch.inference_mode(): From 459435cad08852673325cdff669a72c6c68a1f0e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 16:29:51 +0800 Subject: [PATCH 65/90] fix HPU FP8_STATIC peak RAM: disable eager pipeline in new-arch, clear attention_mask in calib - entry.py: add _maybe_disable_hpu_eager_pipeline() to disable PT_HPU_EAGER_PIPELINE_ENABLE for new-arch static wfp8afp8 path, called in AutoRound.__new__ before HPU runtime init - calib.py: clear quantizer.attention_mask at start of compress() to free cached activations early and reduce host RAM --- auto_round/compressors_new/calib.py | 3 ++ auto_round/compressors_new/entry.py | 55 ++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors_new/calib.py b/auto_round/compressors_new/calib.py index c78c95e54..e0c1a691e 100644 --- a/auto_round/compressors_new/calib.py +++ b/auto_round/compressors_new/calib.py @@ -302,6 +302,9 @@ def cache_inter_data(self, block_names, nsamples, layer_names=None, last_cache_n if not self._post_init_done: self.post_init() + if hasattr(self, "quantizer") and hasattr(self.quantizer, "attention_mask"): + self.quantizer.attention_mask = [] + self.inputs = {} self.to_cached_layers = block_names + layer_names diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index c55769da2..fd8d2c215 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -1,6 +1,8 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +import os +from types import SimpleNamespace from typing import Any, Callable, Optional, Union import torch @@ -11,10 +13,11 @@ from auto_round.algorithms.transforms.hadamard.config import HadamardConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor -from auto_round.compressors_new.utils import check_need_act_calibration +from auto_round.compressors_new.utils import check_need_act_calibration, is_static_wfp8afp8 from auto_round.compressors_new.zero_shot import ZeroShotCompressor from auto_round.logger import logger from auto_round.schemes import QuantizationScheme, _parse_scheme +from auto_round.utils.device import get_device_and_parallelism def _preview_resolved_attrs(config, scheme=None) -> dict: @@ -72,6 +75,52 @@ def _eager_validate_scheme(config, scheme=None) -> None: temp_config.check_config() # raises ValueError / NotImplementedError if invalid +def _needs_hpu_fp8_static_eager_guard(config, scheme, device_map) -> bool: + """Return True when new-arch HPU FP8_STATIC should disable eager pipelines. + + On HPU, the new architecture's static FP8 calibration path can trigger + persistent host-side eager-pipeline growth across blocks. 
Disabling eager + pipeline restores old-arch-like host RAM usage for this specific path. + """ + device, _ = get_device_and_parallelism(device_map) + if not str(device).startswith("hpu"): + return False + + resolved = _preview_resolved_attrs(config, scheme) + attrs = SimpleNamespace( + bits=resolved.get("bits", getattr(config, "bits", None)), + act_bits=resolved.get("act_bits", getattr(config, "act_bits", None)), + data_type=resolved.get("data_type", getattr(config, "data_type", None)), + act_data_type=resolved.get("act_data_type", getattr(config, "act_data_type", None)), + act_dynamic=resolved.get("act_dynamic", getattr(config, "act_dynamic", None)), + ) + return is_static_wfp8afp8(attrs) + + +def _maybe_disable_hpu_eager_pipeline(config, scheme, device_map) -> None: + """Apply the HPU eager-pipeline guard for the affected new-arch path. + + Respect explicit user configuration. If either environment variable is set, + AutoRound assumes the caller intentionally chose the runtime behavior. + """ + if not _needs_hpu_fp8_static_eager_guard(config, scheme, device_map): + return + + eager_keys = ( + "PT_HPU_EAGER_PIPELINE_ENABLE", + "PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", + ) + if any(key in os.environ for key in eager_keys): + return + + os.environ["PT_HPU_EAGER_PIPELINE_ENABLE"] = "0" + os.environ["PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE"] = "0" + logger.warning_once( + "Disabling HPU eager pipeline for new-architecture FP8_STATIC tuning to avoid host RAM growth. " + "Set PT_HPU_EAGER_PIPELINE_ENABLE/PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE explicitly to override." + ) + + # --------------------------------------------------------------------------- # Compressor-class registry # --------------------------------------------------------------------------- @@ -209,6 +258,10 @@ def __new__( # callers get ValueError/NotImplementedError on construction, not deferred. _eager_validate_scheme(quant_config, scheme) + # Guard the known HPU FP8_STATIC host-RAM regression before any HPU + # runtime initialization performed by the concrete compressor. 
+ _maybe_disable_hpu_eager_pipeline(quant_config, scheme, device_map) + # using different compressor base on AlgConfigs local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} From 463f58d24302da32fc58e67bc8ec5d164f8ca0a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:36:59 +0000 Subject: [PATCH 66/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_cpu/utils/test_hpu_eager_guard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_cpu/utils/test_hpu_eager_guard.py b/test/test_cpu/utils/test_hpu_eager_guard.py index 0d8704ae9..5df624ae1 100644 --- a/test/test_cpu/utils/test_hpu_eager_guard.py +++ b/test/test_cpu/utils/test_hpu_eager_guard.py @@ -31,4 +31,4 @@ def test_hpu_fp8_static_eager_guard_respects_user_env(monkeypatch): _maybe_disable_hpu_eager_pipeline(RTNConfig(), "FP8_STATIC", "hpu") assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") == "1" - assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "1" \ No newline at end of file + assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "1" From 6cbb1560bc4f24857d4d4a80bd59976a72d756fc Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 16:37:15 +0800 Subject: [PATCH 67/90] test: add unit tests for HPU FP8_STATIC eager pipeline guard --- test/test_cpu/utils/test_hpu_eager_guard.py | 34 +++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 test/test_cpu/utils/test_hpu_eager_guard.py diff --git a/test/test_cpu/utils/test_hpu_eager_guard.py b/test/test_cpu/utils/test_hpu_eager_guard.py new file mode 100644 index 000000000..0d8704ae9 --- /dev/null +++ b/test/test_cpu/utils/test_hpu_eager_guard.py @@ -0,0 +1,34 @@ +import os + +from auto_round.algorithms.quantization.rtn.config import RTNConfig +from auto_round.compressors_new.entry import _maybe_disable_hpu_eager_pipeline + + +def test_hpu_fp8_static_eager_guard_sets_env(monkeypatch): + monkeypatch.delenv("PT_HPU_EAGER_PIPELINE_ENABLE", raising=False) + monkeypatch.delenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", raising=False) + + _maybe_disable_hpu_eager_pipeline(RTNConfig(), "FP8_STATIC", "hpu") + + assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") == "0" + assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "0" + + +def test_hpu_non_fp8_static_does_not_set_env(monkeypatch): + monkeypatch.delenv("PT_HPU_EAGER_PIPELINE_ENABLE", raising=False) + monkeypatch.delenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", raising=False) + + _maybe_disable_hpu_eager_pipeline(RTNConfig(), "W4A16", "hpu") + + assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") is None + assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") is None + + +def test_hpu_fp8_static_eager_guard_respects_user_env(monkeypatch): + monkeypatch.setenv("PT_HPU_EAGER_PIPELINE_ENABLE", "1") + monkeypatch.setenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", "1") + + _maybe_disable_hpu_eager_pipeline(RTNConfig(), "FP8_STATIC", "hpu") + + assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") == "1" + assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "1" \ No newline at end of file From 9f88982b06107e2e8e9d4b4332b5ae5fb460a2f1 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 17 Apr 2026 16:53:33 +0800 Subject: [PATCH 68/90] fix ut Signed-off-by: n1ck-guo --- test/test_cuda/integrations/test_sglang.py | 14 ++++++++++++ test/test_cuda/integrations/test_vllm.py | 25 
++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/test/test_cuda/integrations/test_sglang.py b/test/test_cuda/integrations/test_sglang.py index 8db11b6c7..196fa2efa 100644 --- a/test/test_cuda/integrations/test_sglang.py +++ b/test/test_cuda/integrations/test_sglang.py @@ -35,6 +35,20 @@ def setup_and_teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def _run_sglang_inference(self, model_path: Path): + # SM 12.x (Blackwell) GPUs require CUDA >= 12.9 for sglang's gptq_marlin_repack JIT kernel. + # Skip inference when the environment is known to be incompatible. + if torch.cuda.is_available(): + try: + major, minor = torch.cuda.get_device_capability() + if major >= 12: + cuda_ver = tuple(int(x) for x in (torch.version.cuda or "0.0").split(".")[:2]) + if cuda_ver < (12, 9): + pytest.skip( + f"SM {major}.{minor} GPU requires CUDA >= 12.9 for sglang GPTQ JIT kernels " + f"(installed: CUDA {torch.version.cuda})" + ) + except Exception: + pass llm = sgl.Engine( model_path=str(model_path), mem_fraction_static=0.5, disable_piecewise_cuda_graph=True, cuda_graph_bs=[1] ) diff --git a/test/test_cuda/integrations/test_vllm.py b/test/test_cuda/integrations/test_vllm.py index 5ca4a2469..e9faa3e14 100644 --- a/test/test_cuda/integrations/test_vllm.py +++ b/test/test_cuda/integrations/test_vllm.py @@ -21,6 +21,31 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +def _is_sm12_with_old_cuda() -> bool: + """Return True when the GPU is SM 12.x (Blackwell) and CUDA < 12.9. + + vLLM's gptq_marlin JIT kernels require CUDA >= 12.9 on SM 12.x devices. + """ + try: + import torch + + if not torch.cuda.is_available(): + return False + major, _ = torch.cuda.get_device_capability() + if major < 12: + return False + cuda_ver = tuple(int(x) for x in (torch.version.cuda or "0.0").split(".")[:2]) + return cuda_ver < (12, 9) + except Exception: + return False + + +pytestmark = pytest.mark.skipif( + _is_sm12_with_old_cuda(), + reason="SM 12.x (Blackwell) GPU requires CUDA >= 12.9 for vLLM GPTQ marlin JIT kernels", +) + MODELS = [ "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", ##auto_round:auto_gptq "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound", ##auto_round:auto_awq From e6de66bbb98d5837712a2af4beb6a1cd5d298020 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 22 Apr 2026 15:27:14 +0800 Subject: [PATCH 69/90] fix merge Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 13 ++++++++++--- auto_round/special_model_handler.py | 2 +- auto_round/utils/device.py | 9 +++++++-- test/test_cpu/export/test_export.py | 2 ++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 8ce842d76..054ab9acd 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -467,9 +467,16 @@ def _sampling_inputs( for key in input_others.keys(): if "positional_inputs" in key: continue - if (key not in share_cache_keys or len(indices) == 1) and not isinstance( - input_others[key], (str, bool, type(None)) - ): + if key in share_cache_keys: + # Shared keys are stored once (not per-sample), often wrapped in a + # 1-element list by the caching hook. Unwrap so the model receives + # the raw value (e.g. (cos, sin) tuple, not [(cos, sin)]). 
+ val = input_others[key] + if isinstance(val, list) and len(val) == 1: + current_input_others[key] = val[0] + else: + current_input_others[key] = val + elif not isinstance(input_others[key], (str, bool, type(None))): current_input_others[key] = None if input_others[key] is not None: current_input_others[key] = [input_others[key][i] for i in indices] diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index f9ae373a3..88560101c 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -204,7 +204,7 @@ def _handle_special_model(model): from functools import partial model.forward = partial(_qwen3_omni_moe_forward, model) - if hasattr(model, "config") and model.config.model_type == "gemma4": + if hasattr(model, "config") and model_type == "gemma4": import transformers from packaging import version diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 1f717c40f..d5be01ff2 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -40,7 +40,6 @@ "hpu": "HABANA_VISIBLE_MODULES", } - # Note on HPU usage: # There are two modes available for enabling auto-round on HPU: # 1. Compile Mode @@ -284,7 +283,13 @@ def is_valid_digit(s): return device -def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[str, bool]: +def get_device_and_parallelism(device: Union[str, torch.device, int, dict]) -> tuple[str, bool]: + if isinstance(device, dict): + unique_devices = set(device.values()) + if len(unique_devices) == 1: + device = next(iter(unique_devices)) + else: + device = "auto" if isinstance(device, str): if device in ["cuda", "xpu", "hpu"]: device = detect_device(device) diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index a72b336b2..08ecef2d1 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -24,6 +24,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -420,6 +421,7 @@ def test_export_format(self): model=self.model_name, scheme="INT8_W8A8", ) + autoround_old.post_init() format_list_old = get_formats("llm_compressor, auto_round:llm_compressor", autoround_old) assert format_list_old[0].output_format == "llm_compressor" assert format_list_old[0].get_backend_name() == "llm_compressor:int8_w8a8" From 85740b5c69f6f28b2f80d154effe9e94f998e73a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 22 Apr 2026 16:13:33 +0800 Subject: [PATCH 70/90] fix Signed-off-by: n1ck-guo --- auto_round/compressors_new/entry.py | 54 +---------------------------- auto_round/utils/device.py | 3 ++ 2 files changed, 4 insertions(+), 53 deletions(-) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index fd8d2c215..3765de4bd 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 import os -from types import SimpleNamespace from typing import Any, Callable, Optional, Union import torch @@ -13,11 +12,10 @@ from auto_round.algorithms.transforms.hadamard.config import HadamardConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor -from auto_round.compressors_new.utils import check_need_act_calibration, is_static_wfp8afp8 +from auto_round.compressors_new.utils import check_need_act_calibration from 
auto_round.compressors_new.zero_shot import ZeroShotCompressor from auto_round.logger import logger from auto_round.schemes import QuantizationScheme, _parse_scheme -from auto_round.utils.device import get_device_and_parallelism def _preview_resolved_attrs(config, scheme=None) -> dict: @@ -75,52 +73,6 @@ def _eager_validate_scheme(config, scheme=None) -> None: temp_config.check_config() # raises ValueError / NotImplementedError if invalid -def _needs_hpu_fp8_static_eager_guard(config, scheme, device_map) -> bool: - """Return True when new-arch HPU FP8_STATIC should disable eager pipelines. - - On HPU, the new architecture's static FP8 calibration path can trigger - persistent host-side eager-pipeline growth across blocks. Disabling eager - pipeline restores old-arch-like host RAM usage for this specific path. - """ - device, _ = get_device_and_parallelism(device_map) - if not str(device).startswith("hpu"): - return False - - resolved = _preview_resolved_attrs(config, scheme) - attrs = SimpleNamespace( - bits=resolved.get("bits", getattr(config, "bits", None)), - act_bits=resolved.get("act_bits", getattr(config, "act_bits", None)), - data_type=resolved.get("data_type", getattr(config, "data_type", None)), - act_data_type=resolved.get("act_data_type", getattr(config, "act_data_type", None)), - act_dynamic=resolved.get("act_dynamic", getattr(config, "act_dynamic", None)), - ) - return is_static_wfp8afp8(attrs) - - -def _maybe_disable_hpu_eager_pipeline(config, scheme, device_map) -> None: - """Apply the HPU eager-pipeline guard for the affected new-arch path. - - Respect explicit user configuration. If either environment variable is set, - AutoRound assumes the caller intentionally chose the runtime behavior. - """ - if not _needs_hpu_fp8_static_eager_guard(config, scheme, device_map): - return - - eager_keys = ( - "PT_HPU_EAGER_PIPELINE_ENABLE", - "PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", - ) - if any(key in os.environ for key in eager_keys): - return - - os.environ["PT_HPU_EAGER_PIPELINE_ENABLE"] = "0" - os.environ["PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE"] = "0" - logger.warning_once( - "Disabling HPU eager pipeline for new-architecture FP8_STATIC tuning to avoid host RAM growth. " - "Set PT_HPU_EAGER_PIPELINE_ENABLE/PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE explicitly to override." - ) - - # --------------------------------------------------------------------------- # Compressor-class registry # --------------------------------------------------------------------------- @@ -258,10 +210,6 @@ def __new__( # callers get ValueError/NotImplementedError on construction, not deferred. _eager_validate_scheme(quant_config, scheme) - # Guard the known HPU FP8_STATIC host-RAM regression before any HPU - # runtime initialization performed by the concrete compressor. 
- _maybe_disable_hpu_eager_pipeline(quant_config, scheme, device_map) - # using different compressor base on AlgConfigs local_args = {k: v for k, v in locals().items() if k not in cls.SKIP_ARGS} diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index d5be01ff2..3b99abf9f 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -284,6 +284,9 @@ def is_valid_digit(s): def get_device_and_parallelism(device: Union[str, torch.device, int, dict]) -> tuple[str, bool]: + if device is None: + device = detect_device(device) + return device, False if isinstance(device, dict): unique_devices = set(device.values()) if len(unique_devices) == 1: From 66c4da125c4ac9fa4504d1056b2e5752be49cf06 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 22 Apr 2026 16:16:30 +0800 Subject: [PATCH 71/90] clean Signed-off-by: n1ck-guo --- .../docs/compressors_new_architecture.md | 314 --------------- .../docs/compressors_new_architecture_CN.md | 372 ------------------ docs/step_by_step.md | 8 +- docs/step_by_step_CN.md | 8 +- 4 files changed, 8 insertions(+), 694 deletions(-) delete mode 100644 auto_round/compressors_new/docs/compressors_new_architecture.md delete mode 100644 auto_round/compressors_new/docs/compressors_new_architecture_CN.md diff --git a/auto_round/compressors_new/docs/compressors_new_architecture.md b/auto_round/compressors_new/docs/compressors_new_architecture.md deleted file mode 100644 index 78250c3c7..000000000 --- a/auto_round/compressors_new/docs/compressors_new_architecture.md +++ /dev/null @@ -1,314 +0,0 @@ -# Compressor New Architecture - -## Overview - -This document describes the new architecture of `compressors_new`, which provides a unified -quantization entry point for LLM, MLLM, and Diffusion models. - -## Architecture Design - -### Core Idea - -`Compressor` in `entry.py` is the single entry point. It detects the model type and config -type at construction time and dynamically creates the correct concrete class using multiple -inheritance (Mixin pattern). - -### Directory Structure - -``` -compressors_new/ -├── entry.py # Unified entry point — Compressor + AutoRound wrapper -├── base.py # BaseCompressor base class + SerializedCompressorConfig -├── calib.py # CalibCompressor (AutoRound gradient-based) -│ # CalibratedRTNCompressor (RTN + imatrix / act-calib) -├── zero_shot.py # ZeroShotCompressor (zero-shot RTN) -├── mllm_mixin.py # MLLMMixin (vision-language model extra logic) -├── diffusion_mixin.py # DiffusionMixin (diffusion pipeline extra logic) -└── docs/ # This document -``` - -### Class Hierarchy - -``` -BaseCompressor - ├── CalibCompressor (AutoRound, gradient-based calibration) - ├── CalibratedRTNCompressor (RTN + importance-matrix or act calibration) - └── ZeroShotCompressor (RTN, no calibration data needed) - -Mixins (combined dynamically in entry.py): - MLLMMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} - DiffusionMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} -``` - -## Configuration Layer - -### QuantizationConfig (dataclass) - -`QuantizationConfig` is declared as a `@dataclass(kw_only=True)`, which eliminates -`__init__` boilerplate. Subclasses call `super().__init__(scheme=..., **kwargs)` as normal: - -```python -@dataclass(kw_only=True) -class QuantizationConfig(AlgConfig): - _alg_cls: ClassVar[str] = None # which quantizer class to use - - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" - bits: int = None - group_size: int = None # also accepts tuple, e.g. 
(128,128) for block-FP8 - # ... other fields - - def __post_init__(self): - self._early_resolve_scheme() # eagerly resolves scheme attrs at construction time -``` - -Subclasses: -- `RTNConfig(QuantizationConfig)` — adds `disable_opt_rtn`, `seqlen`, `nsamples`, `batch_size` -- `SignRoundConfig(QuantizationConfig)` — adds `iters`, `lr`, `nblocks`, `enable_minmax_tuning`, … - -### AlgConfig - -`AlgConfig` is the base class used as type annotation throughout `compressors_new/`. -Both `QuantizationConfig` and future non-quantization configs inherit from it. - -## ModelContext - -`ModelContext.__init__` **eagerly loads the model** — by the time `BaseCompressor.__init__` -returns, the model is already loaded in CPU memory. - -```python -class ModelContext(BaseContext): - def __init__(self, model, tokenizer, platform, ..., formats, is_act_quantize, quant_nontext_module): - # ... store attrs - self._load_model() # load LLM / MLLM / Diffusion model - check_and_mark_quantized_module(self.model) - self.model = self.model.eval() - self.shared_cache_keys = get_shared_keys(self.model) - self.is_moe_model = is_moe_model(self.model) - self._set_amp_dtype() - - def apply_patches(self, formats): - """Apply format-specific model structure patches. - Called by BaseCompressor.post_init() after formats are resolved. - """ - self._patch_custom_moe_modules() # e.g. Qwen3VL top_k fix - self.model = update_module(self.model, formats=formats, ...) - for n, m in self.model.named_modules(): - m.global_name = n # assign names used by quantizers - self._is_initialized = True -``` - -## BaseCompressor.post_init() Flow - -`post_init()` is called at the start of `quantize()` (not in `__init__`). -The order matters — model patches must come before quantizer setup: - -``` -post_init() -│ -├─ 1. Resolve formats (str → list[OutputFormat]) -│ -├─ 2. Apply model patches -│ model_context.apply_patches(formats) -│ ├── _patch_custom_moe_modules() -│ ├── update_module(model, formats) # insert gguf_pack_linear, etc. -│ └── assign m.global_name to all modules -│ -├─ 3. Setup quantizer on the patched model -│ quantizer = BaseQuantizers.from_config(config) -│ quantizer.post_init() -│ ├── _parse_scheme() → resolve final quant attrs -│ ├── get_block_names(quant_vision=quant_nontext_module) -│ ├── find_matching_blocks() → quant_block_list -│ ├── back-fill to_quant_block_names (if was None) -│ └── configure_layer_config() -│ -└─ 4. Setup device map, torch compile, offloader -``` - -> **No `refresh_quantizer_for_initialized_model()`** — eliminated by running `apply_patches` -> *before* `quantizer.post_init()`. - -## BaseQuantizers Interface - -All quantizers accept **names** (str), not module objects. -The module is retrieved internally via `get_module(model, name)`: - -```python -class BaseQuantizers: - def quantize_block( - self, - block_name: Union[str, list[str]], # list[str] for nblocks > 1 - input_ids=None, - input_others=None, - **kwargs, - ): ... - - def quantize_layer(self, layer_name: str, **kwargs): ... 
-``` - -- `str` → `get_module(model, block_name)` -- `list[str]` → `WrapperMultiblock([get_module(model, n) for n in block_name])` (multi-block) - -## Compressor Selection Decision Tree - -``` -Compressor.__new__(config, model, format, **kwargs) -│ -├─ Detect model type -│ ├─ is_diffusion_model() → "diffusion" -│ ├─ is_mllm_model() → "mllm" -│ └─ else → "llm" -│ -├─ isinstance(config, SignRoundConfig) -│ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) -│ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) -│ └─ llm → CalibCompressor -│ -└─ isinstance(config, RTNConfig) - ├─ enable_imatrix OR needs_act_calib → CalibratedRTNCompressor path - │ ├─ gguf_k format → enable_imatrix = True - │ ├─ symmetric int RTN → enable_imatrix = True - │ ├─ static act quantization → needs_act_calib = True - │ │ - │ ├─ mllm → class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) - │ ├─ diffusion → class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor) - │ └─ llm → CalibratedRTNCompressor - │ - └─ else → ZeroShotCompressor path - ├─ mllm → class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) - ├─ diffusion → class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) - └─ llm → ZeroShotCompressor -``` - -## MLLMMixin - -```python -class MLLMMixin: - def __init__( - self, - *args, - processor=None, - image_processor=None, - template=None, - extra_data_dir=None, - quant_nontext_module=False, - **kwargs - ): - self.processor = processor - self.template = template - self.quant_nontext_module = quant_nontext_module - # Pass to ModelContext so get_block_names includes vision blocks - kwargs.setdefault("quant_nontext_module", quant_nontext_module) - super().__init__(*args, **kwargs) - - def calib(self, nsamples, bs): - # Uses get_mllm_dataloader with template / processor - ... 
-``` - -`quant_nontext_module` flow: -`MLLMMixin.__init__` → `kwargs.setdefault` → `BaseCompressor.__init__` pops → `ModelContext(quant_nontext_module=...)` -→ `BaseQuantizers.post_init()` calls `get_block_names(quant_vision=quant_nontext_module)` - -## Usage Examples - -### Basic LLM quantization - -```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig - -config = SignRoundConfig(scheme="W4A16", iters=200, nsamples=128) -compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) -quantized_model, layer_config = compressor.quantize() -``` - -### MLLM (vision-language model) - -```python -config = SignRoundConfig(scheme="W4A16", iters=200) -compressor = Compressor( - config=config, - model="/models/Qwen2-VL-2B-Instruct", - processor=processor, - template="qwen2_vl", - quant_nontext_module=False, # True to also quantize vision encoder -) -# Creates: MLLMCalibCompressor(MLLMMixin, CalibCompressor) -``` - -### Diffusion model - -```python -config = SignRoundConfig(scheme="W4A16", iters=200) -compressor = Compressor( - config=config, - model="/models/stable-diffusion-2-1", - guidance_scale=7.5, -) -# Creates: DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) -``` - -### RTN zero-shot - -```python -from auto_round.algorithms.quantization.rtn.config import RTNConfig - -config = RTNConfig(scheme="W4A16") -compressor = Compressor(config=config, model="/path/to/model") -``` - -### RTN with imatrix (GGUF k-quants) - -```python -config = RTNConfig(scheme="W4A16") -compressor = Compressor(config=config, model="/path/to/model", format="gguf_k") -# Creates: CalibratedRTNCompressor (enable_imatrix=True) -``` - -## Extending with New Model Types - -**Step 1**: Create a new Mixin in `compressors_new/`: - -```python -class AudioMixin: - def __init__(self, *args, audio_processor=None, **kwargs): - self.audio_processor = audio_processor - super().__init__(*args, **kwargs) - - def calib(self, nsamples, bs): ... -``` - -**Step 2**: Add detection in `entry.py`: - -```python -def detect_model_type(model): - if is_audio_model(model): - return "audio" - if is_diffusion_model(model): - return "diffusion" - ... 
-``` - -**Step 3**: Add routing in `Compressor.__new__()`: - -```python -if model_type == "audio": - from auto_round.compressors_new.audio_mixin import AudioMixin - - class AudioCalibCompressor(AudioMixin, CalibCompressor): - pass - - return AudioCalibCompressor(config, **local_args, **kwargs) -``` - -## Summary - -| Aspect | Description | -|---|---| -| **Entry point** | Single `Compressor` class, auto-detects model type | -| **Config** | `QuantizationConfig` dataclass; subclasses `RTNConfig`, `SignRoundConfig` | -| **Model loading** | `ModelContext.__init__` loads eagerly; `apply_patches()` runs before quantizer setup | -| **9 combinations** | 3 model types × 3 compressors, dynamic classes via Mixin | -| **Quantizer interface** | Name-based `quantize_block(name)` / `quantize_layer(name)`, not module objects | -| **Extension** | Add new model type in 3 steps (Mixin class, detect fn, routing) | diff --git a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md b/auto_round/compressors_new/docs/compressors_new_architecture_CN.md deleted file mode 100644 index 8ab713d07..000000000 --- a/auto_round/compressors_new/docs/compressors_new_architecture_CN.md +++ /dev/null @@ -1,372 +0,0 @@ -# Compressor 新架构说明 - -## 概述 - -本文档介绍 `compressors_new` 的新架构设计,为 LLM、MLLM 和 Diffusion 模型提供统一的量化入口。 - -## 架构设计 - -### 核心思想 - -`entry.py` 中的 `Compressor` 是唯一入口。构造时自动检测模型类型和配置类型,通过多重继承(Mixin 模式)动态创建正确的具体类。 - -### 目录结构 - -``` -compressors_new/ -├── entry.py # 统一入口 — Compressor + AutoRound 兼容层 -├── base.py # BaseCompressor 基类 + SerializedCompressorConfig -├── calib.py # CalibCompressor(AutoRound 梯度校准) -│ # CalibratedRTNCompressor(RTN + imatrix / 激活校准) -├── zero_shot.py # ZeroShotCompressor(零样本 RTN) -├── mllm_mixin.py # MLLMMixin(视觉-语言模型扩展逻辑) -├── diffusion_mixin.py # DiffusionMixin(扩散模型 pipeline 扩展逻辑) -└── docs/ # 本文档 -``` - -### 类继承关系 - -``` -BaseCompressor - ├── CalibCompressor (AutoRound,基于梯度的校准量化) - ├── CalibratedRTNCompressor (RTN + importance-matrix 或激活校准) - └── ZeroShotCompressor (RTN,不需要校准数据) - -Mixin(在 entry.py 中动态组合): - MLLMMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} - DiffusionMixin + {CalibCompressor | CalibratedRTNCompressor | ZeroShotCompressor} -``` - -## 配置层 - -### QuantizationConfig(dataclass) - -`QuantizationConfig` 声明为 `@dataclass(kw_only=True)`,消除了 `__init__` 中的样板代码。 -子类仍然用 `super().__init__(scheme=..., **kwargs)` 正常调用: - -```python -@dataclass(kw_only=True) -class QuantizationConfig(AlgConfig): - _alg_cls: ClassVar[str] = None # 指定使用哪个量化器类 - - scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16" - bits: int = None - group_size: int = None # 也接受 tuple,如 (128,128) 用于块状 FP8 - # ... 其他字段 - - def __post_init__(self): - self._early_resolve_scheme() # 构造时即刻解析 scheme 属性 -``` - -子类: -- `RTNConfig(QuantizationConfig)` — 新增 `disable_opt_rtn`、`seqlen`、`nsamples`、`batch_size` -- `SignRoundConfig(QuantizationConfig)` — 新增 `iters`、`lr`、`nblocks`、`enable_minmax_tuning` 等 - -### AlgConfig - -`AlgConfig` 是基类,用于 `compressors_new/` 各处的类型标注。 -`QuantizationConfig` 及未来的非量化配置都继承自它。 - -## ModelContext - -`ModelContext.__init__` **立即加载模型** —— `BaseCompressor.__init__` 返回时,模型已经在 CPU 内存中。 - -```python -class ModelContext(BaseContext): - def __init__(self, model, tokenizer, platform, ..., formats, is_act_quantize, quant_nontext_module): - # ... 
存储属性 - self._load_model() # 加载 LLM / MLLM / Diffusion 模型 - check_and_mark_quantized_module(self.model) - self.model = self.model.eval() - self.shared_cache_keys = get_shared_keys(self.model) - self.is_moe_model = is_moe_model(self.model) - self._set_amp_dtype() - - def apply_patches(self, formats): - """应用格式相关的模型结构补丁。 - 由 BaseCompressor.post_init() 在 formats 解析完毕后调用。 - """ - self._patch_custom_moe_modules() # 如 Qwen3VL top_k 修复 - self.model = update_module(self.model, formats=formats, ...) - for n, m in self.model.named_modules(): - m.global_name = n # 赋予量化器使用的全局名称 - self._is_initialized = True -``` - -## BaseCompressor.post_init() 执行流程 - -`post_init()` 在 `quantize()` 开始时调用(不在 `__init__` 中)。 -顺序至关重要——模型补丁必须在量化器初始化之前完成: - -``` -post_init() -│ -├─ 1. 解析 formats(str → list[OutputFormat]) -│ -├─ 2. 应用模型补丁 -│ model_context.apply_patches(formats) -│ ├── _patch_custom_moe_modules() -│ ├── update_module(model, formats) # 插入 gguf_pack_linear 等 -│ └── 为所有模块赋予 m.global_name -│ -├─ 3. 在已补丁的模型上初始化量化器 -│ quantizer = BaseQuantizers.from_config(config) -│ quantizer.post_init() -│ ├── _parse_scheme() → 解析最终量化属性 -│ ├── get_block_names(quant_vision=quant_nontext_module) -│ ├── find_matching_blocks() → quant_block_list -│ ├── 反填 to_quant_block_names(如果原来为 None) -│ └── configure_layer_config() -│ -└─ 4. 设置 device_map、torch compile、offloader -``` - -> **无 `refresh_quantizer_for_initialized_model()`** —— 旧调用已通过先执行 `apply_patches`、 -> 再调用 `quantizer.post_init()` 的顺序调整消除。 - -## BaseQuantizers 接口 - -所有量化器接受**名称**(str),而非模块对象。 -模块在内部通过 `get_module(model, name)` 获取: - -```python -class BaseQuantizers: - def quantize_block( - self, - block_name: Union[str, list[str]], # list[str] 用于 nblocks > 1 - input_ids=None, - input_others=None, - **kwargs, - ): ... - - def quantize_layer(self, layer_name: str, **kwargs): ... 
-``` - -- `str` → `get_module(model, block_name)` -- `list[str]` → `WrapperMultiblock([get_module(model, n) for n in block_name])`(多块模式) - -## Compressor 选择决策树 - -``` -Compressor.__new__(config, model, format, **kwargs) -│ -├─ 检测模型类型 -│ ├─ is_diffusion_model() → "diffusion" -│ ├─ is_mllm_model() → "mllm" -│ └─ 其他 → "llm" -│ -├─ isinstance(config, SignRoundConfig) -│ ├─ mllm → class MLLMCalibCompressor(MLLMMixin, CalibCompressor) -│ ├─ diffusion → class DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) -│ └─ llm → CalibCompressor -│ -└─ isinstance(config, RTNConfig) - ├─ enable_imatrix 或 needs_act_calib → CalibratedRTNCompressor 路径 - │ ├─ gguf_k 格式 → enable_imatrix = True - │ ├─ 对称 int RTN → enable_imatrix = True - │ ├─ 静态激活量化 → needs_act_calib = True - │ │ - │ ├─ mllm → class MLLMCalibratedRTNCompressor(MLLMMixin, CalibratedRTNCompressor) - │ ├─ diffusion → class DiffusionCalibratedRTNCompressor(DiffusionMixin, CalibratedRTNCompressor) - │ └─ llm → CalibratedRTNCompressor - │ - └─ 其他(零样本) → ZeroShotCompressor 路径 - ├─ mllm → class MLLMZeroShotCompressor(MLLMMixin, ZeroShotCompressor) - ├─ diffusion → class DiffusionZeroShotCompressor(DiffusionMixin, ZeroShotCompressor) - └─ llm → ZeroShotCompressor -``` - -## MLLMMixin - -```python -class MLLMMixin: - def __init__( - self, - *args, - processor=None, - image_processor=None, - template=None, - extra_data_dir=None, - quant_nontext_module=False, - **kwargs - ): - self.processor = processor - self.template = template - self.quant_nontext_module = quant_nontext_module - # 传给 ModelContext,使 get_block_names 包含视觉编码器的块 - kwargs.setdefault("quant_nontext_module", quant_nontext_module) - super().__init__(*args, **kwargs) - - def calib(self, nsamples, bs): - # 使用 get_mllm_dataloader,带 template / processor - ... -``` - -`quant_nontext_module` 传递链路: -`MLLMMixin.__init__` → `kwargs.setdefault` → `BaseCompressor.__init__` pop -→ `ModelContext(quant_nontext_module=...)` → `BaseQuantizers.post_init()` -调用 `get_block_names(quant_vision=quant_nontext_module)` - -## MRO(方法解析顺序)示例 - -``` -MLLMCalibCompressor(entry.py 中动态创建) - └─> MLLMMixin - └─> CalibCompressor - └─> BaseCompressor - └─> object - -调用 __init__() 的执行顺序: - 1. MLLMCalibCompressor.__init__() → 未定义,向上查找 - 2. MLLMMixin.__init__() - - 保存 MLLM 专属属性:processor、template、quant_nontext_module 等 - - kwargs.setdefault("quant_nontext_module", ...) - - super().__init__() → 进入 CalibCompressor - 3. 
CalibCompressor.__init__() → BaseCompressor.__init__() - - pop quant_nontext_module from kwargs - - 创建 ModelContext(..., quant_nontext_module=quant_nontext_module) - - ModelContext.__init__ 立即加载模型 - - 创建 CompressContext 单例 - -结果:MLLMCalibCompressor 实例同时具备: - ✓ MLLMMixin 提供的 MLLM 特性(processor、template、calib() 重写) - ✓ CalibCompressor 提供的梯度校准量化 - ✓ BaseCompressor 提供的模型/上下文管理 -``` - -## 使用示例 - -### 基本 LLM 量化 - -```python -from auto_round.compressors_new.entry import Compressor -from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig - -config = SignRoundConfig(scheme="W4A16", iters=200, nsamples=128) -compressor = Compressor(config=config, model="/path/to/llm", tokenizer=tokenizer) -quantized_model, layer_config = compressor.quantize() -``` - -### MLLM(视觉-语言模型) - -```python -config = SignRoundConfig(scheme="W4A16", iters=200) -compressor = Compressor( - config=config, - model="/models/Qwen2-VL-2B-Instruct", - processor=processor, - template="qwen2_vl", - quant_nontext_module=False, # True 则同时量化视觉编码器 -) -# 创建:MLLMCalibCompressor(MLLMMixin, CalibCompressor) -quantized_model, layer_config = compressor.quantize() -``` - -### Diffusion 扩散模型 - -```python -config = SignRoundConfig(scheme="W4A16", iters=200) -compressor = Compressor( - config=config, - model="/models/stable-diffusion-2-1", - guidance_scale=7.5, -) -# 创建:DiffusionCalibCompressor(DiffusionMixin, CalibCompressor) -``` - -### RTN 零样本 - -```python -from auto_round.algorithms.quantization.rtn.config import RTNConfig - -config = RTNConfig(scheme="W4A16") -compressor = Compressor(config=config, model="/path/to/model") -``` - -### RTN + imatrix(GGUF k-quants) - -```python -config = RTNConfig(scheme="W4A16") -compressor = Compressor(config=config, model="/path/to/model", format="gguf_k") -# 创建:CalibratedRTNCompressor(enable_imatrix=True) -``` - -## 扩展新模型类型 - -**第 1 步**:在 `compressors_new/` 中创建新 Mixin: - -```python -class AudioMixin: - def __init__(self, *args, audio_processor=None, **kwargs): - self.audio_processor = audio_processor - super().__init__(*args, **kwargs) - - def calib(self, nsamples, bs): - # 音频专用 dataloader - ... -``` - -**第 2 步**:在 `entry.py` 中添加检测逻辑: - -```python -def detect_model_type(model): - if is_audio_model(model): - return "audio" - if is_diffusion_model(model): - return "diffusion" - ... -``` - -**第 3 步**:在 `Compressor.__new__()` 中添加路由: - -```python -if model_type == "audio": - from auto_round.compressors_new.audio_mixin import AudioMixin - - class AudioCalibCompressor(AudioMixin, CalibCompressor): - pass - - return AudioCalibCompressor(config, **local_args, **kwargs) -``` - -## 常见问题 - -### Q1:如何确认我的模型会使用哪个 Compressor? - -```python -from auto_round.compressors_new.entry import detect_model_type, Compressor -from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig - -model_path = "/your/model/path" -print(f"模型类型: {detect_model_type(model_path)}") - -config = SignRoundConfig(scheme="W4A16") -comp = Compressor(config=config, model=model_path) -print(f"Compressor 类型: {type(comp).__name__}") -``` - -### Q2:RTN 和 AutoRound 有什么区别? - -| 特性 | RTN | AutoRound | -|------|-----|-----------| -| 需要校准数据 | ❌ 否(ZeroShot)/ ✅ 是(Calibrated) | ✅ 是 | -| 量化质量 | 较低 | 较高 | -| 量化速度 | 快 | 慢 | -| Compressor | ZeroShotCompressor / CalibratedRTNCompressor | CalibCompressor | - -### Q3:`group_size` 可以是 tuple 吗? 
- -可以。块状 FP8(如 `FP8_BLOCK` scheme)会将 `group_size` 设置为 `(128, 128)`, -`check_config()` 已通过 `_is_valid_group_size()` 静态方法正确处理 tuple/list/scalar 三种形式。 - -## 总结 - -| 特性 | 说明 | -|---|---| -| **统一入口** | 单一 `Compressor` 类,自动检测模型类型 | -| **配置** | `QuantizationConfig` dataclass;子类 `RTNConfig`、`SignRoundConfig` | -| **模型加载** | `ModelContext.__init__` 立即加载;`apply_patches()` 在量化器初始化前运行 | -| **9 种组合** | 3 种模型类型 × 3 种 Compressor,通过 Mixin 动态创建 | -| **量化器接口** | 基于名称的 `quantize_block(name)` / `quantize_layer(name)`,非模块对象 | -| **扩展** | 3 步添加新模型类型(Mixin 类、检测函数、路由) | - diff --git a/docs/step_by_step.md b/docs/step_by_step.md index c3cf4bfd4..0327f4dc1 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -760,10 +760,10 @@ The backend may not always be the most suitable for certain devices. You can specify your preferred backend such as "ark" for CPU and Intel GPU, "marlin/exllamav2/triton" for CUDA, according to your needs or hardware compatibility. Please note that additional corresponding libraries may be required. ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -quantization_config = SignRoundConfig(backend="ark") +quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) @@ -798,10 +798,10 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal Most GPTQ/AWQ models can be converted to the AutoRound format for better compatibility and support with Intel devices. Please note that the quantization config will be changed if the model is serialized. 
```python -from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" -quantization_config = SignRoundConfig() +quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) diff --git a/docs/step_by_step_CN.md b/docs/step_by_step_CN.md index 3a972f226..d535718a6 100644 --- a/docs/step_by_step_CN.md +++ b/docs/step_by_step_CN.md @@ -715,10 +715,10 @@ AutoRound 会根据兼容性为每个层自动选择推理后端,默认优先 指定后端的示例: ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -quantization_config = SignRoundConfig(backend="ark") +quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) @@ -755,10 +755,10 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal 转换并推理的示例: ```python -from transformers import AutoModelForCausalLM, AutoTokenizer, SignRoundConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" -quantization_config = SignRoundConfig() +quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" ) From ab9d972db0cc212d6240d11537293e6ad66f0c18 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 22 Apr 2026 22:12:53 +0800 Subject: [PATCH 72/90] fix W8A16 VRAM regression: skip block_forward compile for zero-shot path For ZeroShotCompressor (need_calib=False), block_forward is never called during W8A16/RTN zero-shot quantization. Previously, compile_func was called unconditionally in _hardware_setup, which on HPU triggers hpu_backend initialization and allocates ~0.84 GB of workspace memory even though the compiled function is never executed. Add 'and self.need_calib' guard to skip compilation when block_forward won't be used. This matches old arch behavior where block_forward was NOT compiled for W8A16 with iters=200 due to 'not self.disable_opt_rtn' check. Also remove test_hpu_eager_guard.py which tested the deleted _maybe_disable_hpu_eager_pipeline function. Signed-off-by: n1ck-guo --- auto_round/compressors_new/base.py | 5 ++- test/test_cpu/utils/test_hpu_eager_guard.py | 34 --------------------- 2 files changed, 4 insertions(+), 35 deletions(-) delete mode 100644 test/test_cpu/utils/test_hpu_eager_guard.py diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 3b9e9faf5..8d9842503 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -864,7 +864,10 @@ def _hardware_setup(self) -> None: _needs_plain_forward = (cfg.is_act_quantize and (not cfg.act_dynamic or cfg.is_act_nv_fp)) or getattr( cfg, "enable_alg_ext", False ) - if self.enable_torch_compile and not _needs_plain_forward: + # Only compile block_forward when it will actually be used (calibration path). + # For zero-shot compressors (need_calib=False), block_forward is never called, + # so skipping compilation avoids unnecessary HPU workspace allocation. 
+ if self.enable_torch_compile and not _needs_plain_forward and self.need_calib: self.block_forward = compile_func(block_forward, self.compress_context.device) else: self.block_forward = block_forward diff --git a/test/test_cpu/utils/test_hpu_eager_guard.py b/test/test_cpu/utils/test_hpu_eager_guard.py deleted file mode 100644 index 5df624ae1..000000000 --- a/test/test_cpu/utils/test_hpu_eager_guard.py +++ /dev/null @@ -1,34 +0,0 @@ -import os - -from auto_round.algorithms.quantization.rtn.config import RTNConfig -from auto_round.compressors_new.entry import _maybe_disable_hpu_eager_pipeline - - -def test_hpu_fp8_static_eager_guard_sets_env(monkeypatch): - monkeypatch.delenv("PT_HPU_EAGER_PIPELINE_ENABLE", raising=False) - monkeypatch.delenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", raising=False) - - _maybe_disable_hpu_eager_pipeline(RTNConfig(), "FP8_STATIC", "hpu") - - assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") == "0" - assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "0" - - -def test_hpu_non_fp8_static_does_not_set_env(monkeypatch): - monkeypatch.delenv("PT_HPU_EAGER_PIPELINE_ENABLE", raising=False) - monkeypatch.delenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", raising=False) - - _maybe_disable_hpu_eager_pipeline(RTNConfig(), "W4A16", "hpu") - - assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") is None - assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") is None - - -def test_hpu_fp8_static_eager_guard_respects_user_env(monkeypatch): - monkeypatch.setenv("PT_HPU_EAGER_PIPELINE_ENABLE", "1") - monkeypatch.setenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE", "1") - - _maybe_disable_hpu_eager_pipeline(RTNConfig(), "FP8_STATIC", "hpu") - - assert os.getenv("PT_HPU_EAGER_PIPELINE_ENABLE") == "1" - assert os.getenv("PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE") == "1" From 70ed236ef8e99d8939af66d5d607e83d1a1f105d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 10:43:10 +0800 Subject: [PATCH 73/90] sync rotation/hadamard: handle rotation_config kwarg in AutoRoundCompatible In old arch, rotation_config was a direct kwarg to AutoRound(). In new arch (AutoRoundCompatible), it was being passed through to the unrecognized kwargs warning and silently ignored. Fix: extract rotation_config in AutoRoundCompatible.__new__, convert it to HadamardConfig (the new arch's transform config type), and prepend it to the alg_configs list alongside the quantization config. Handles all input forms: str ('default', 'random_hadamard'), dict, and RotationConfig instances. backend='inplace' emits a warning and is skipped (inplace backend not yet supported in new arch; requires CUDA/triton via transform backend). --- auto_round/compressors_new/entry.py | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 3765de4bd..901ce19bf 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -455,6 +455,42 @@ def __new__( # Determine output format if specified format = kwargs.pop("format", None) + # Extract rotation_config (old-API kwarg) and convert to HadamardConfig for new arch. + # In old arch, rotation_config was a keyword arg; in new arch, rotation transforms + # are passed as part of alg_configs list. "inplace" backend is not yet supported + # in the new arch (requires CUDA/triton), so we only convert transform-compatible configs. 
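The conversion described in the comment above and implemented in the hunk that follows can be exercised on its own. Below is an illustrative sketch (not taken from the patch) of how the string shorthand accepted by the old `rotation_config` kwarg resolves through the helpers this shim imports, `normalize_hadamard_config` and `HadamardConfig`; the resulting object is what ends up alongside the quantization config in the alg-config list.

```python
from auto_round.algorithms.transforms.hadamard.config import (
    HadamardConfig,
    normalize_hadamard_config,
)

# String shorthand -> canonical dict -> validated config object.
# "random_hadamard" is one of the registered HADAMARD_TYPES, so the normalizer
# returns {"hadamard_type": "random_hadamard", ...} (block_size filled in from
# the default data_type) and validation succeeds.
cfg_dict = normalize_hadamard_config("random_hadamard")
hadamard_cfg = HadamardConfig.model_validate(cfg_dict)
print(hadamard_cfg.hadamard_type)  # -> "random_hadamard"
```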
+ _rotation_config_raw = kwargs.pop("rotation_config", None) + if _rotation_config_raw is not None: + from auto_round.algorithms.transforms.hadamard.config import ( + HadamardConfig, + normalize_hadamard_config, + ) + from auto_round.experimental.transform.rotation_config import RotationConfig as _RotationConfig + + # Resolve to a RotationConfig to check the backend field + if isinstance(_rotation_config_raw, _RotationConfig): + _rc = _rotation_config_raw + elif isinstance(_rotation_config_raw, dict): + _rc = _RotationConfig.model_validate(_rotation_config_raw) + else: + # str ("default", "random_hadamard", …) or plain dict + _rc = _RotationConfig() + + if _rc.backend == "inplace": + logger.warning( + "rotation_config with backend='inplace' is not yet supported in the new architecture. " + "The rotation will be skipped. Use backend='transform' or backend='auto' " + "with an MXFP4/NVFP4 scheme, or pass HadamardConfig() explicitly via alg_configs." + ) + else: + # Convert to HadamardConfig understood by the new arch. + # normalize_hadamard_config only accepts None/str/dict/HadamardConfig, so + # convert RotationConfig instances to dict first (dropping backend field). + _raw_for_norm = _rc.model_dump(exclude={"backend", "fuse_online_to_weight", "allow_online_rotation"}) + hadamard_dict = normalize_hadamard_config(_raw_for_norm) + hadamard_cfg = HadamardConfig.model_validate(hadamard_dict) + config = [config, hadamard_cfg] + # Extract MLLM-specific parameters processor = kwargs.pop("processor", None) image_processor = kwargs.pop("image_processor", None) From c07ca3f16fbe844e5ee48fdc4dd1b318665a3c99 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 10:55:24 +0800 Subject: [PATCH 74/90] sync: rename HadamardConfig to RotationConfig in new arch transforms --- auto_round/algorithms/transforms/__init__.py | 12 +++- .../transforms/hadamard/__init__.py | 22 ++++++-- .../algorithms/transforms/hadamard/apply.py | 36 ++++++------ .../algorithms/transforms/hadamard/config.py | 56 +++++++++++-------- auto_round/compressors_new/entry.py | 24 ++++---- 5 files changed, 90 insertions(+), 60 deletions(-) diff --git a/auto_round/algorithms/transforms/__init__.py b/auto_round/algorithms/transforms/__init__.py index 1a561ad08..8ede2ee31 100644 --- a/auto_round/algorithms/transforms/__init__.py +++ b/auto_round/algorithms/transforms/__init__.py @@ -51,7 +51,10 @@ HadamardConfig, HadamardRotation, apply_hadamard_transform, + apply_rotation_transform, normalize_hadamard_config, + normalize_rotation_config as _normalize_hadamard_for_transforms, + RotationConfig, ) __all__ = [ @@ -60,7 +63,10 @@ "BaseRotationConfig", "ROTATION_SUPPORTED_SCHEMES", "check_supported_schemes", - # Hadamard + # Config (new names) + "RotationConfig", + "apply_rotation_transform", + # Config (backward-compat aliases) "HadamardConfig", "HadamardRotation", "apply_hadamard_transform", @@ -96,14 +102,14 @@ def normalize_rotation_config( if isinstance(config, dict): alg = config.get("algorithm", "hadamard") if alg == "hadamard": - return HadamardConfig.model_validate(config) + return RotationConfig.model_validate(config) raise ValueError( f"Unknown rotation algorithm: {alg!r}. " f"Registered algorithms: {sorted(BaseRotation._REGISTRY)}" ) if isinstance(config, str): # String shorthand → treat as Hadamard config. 
- return HadamardConfig.model_validate(normalize_hadamard_config(config)) + return RotationConfig.model_validate(_normalize_hadamard_for_transforms(config)) raise TypeError( f"Unsupported rotation config type: {type(config).__name__}. " diff --git a/auto_round/algorithms/transforms/hadamard/__init__.py b/auto_round/algorithms/transforms/hadamard/__init__.py index d86923fa5..8ed1c8a06 100644 --- a/auto_round/algorithms/transforms/hadamard/__init__.py +++ b/auto_round/algorithms/transforms/hadamard/__init__.py @@ -13,8 +13,17 @@ # limitations under the License. """Hadamard rotation sub-package for ``algorithms/transforms``.""" -from auto_round.algorithms.transforms.hadamard.apply import HadamardRotation, apply_hadamard_transform -from auto_round.algorithms.transforms.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.transforms.hadamard.apply import ( + HadamardRotation, + apply_rotation_transform, + apply_hadamard_transform, +) +from auto_round.algorithms.transforms.hadamard.config import ( + RotationConfig, + normalize_rotation_config, + HadamardConfig, + normalize_hadamard_config, +) from auto_round.algorithms.transforms.hadamard.transforms import ( HADAMARDS, HadamardTransform, @@ -25,7 +34,10 @@ __all__ = [ # Algorithm class "HadamardRotation", - # Config + # Config (new names) + "RotationConfig", + "normalize_rotation_config", + # Config (backward-compat aliases) "HadamardConfig", "normalize_hadamard_config", # Transform modules @@ -33,6 +45,8 @@ "RandomHadamardTransform", "HADAMARDS", "build_hadamard_transform", - # One-shot convenience + # One-shot convenience (new name) + "apply_rotation_transform", + # One-shot convenience (backward-compat alias) "apply_hadamard_transform", ] diff --git a/auto_round/algorithms/transforms/hadamard/apply.py b/auto_round/algorithms/transforms/hadamard/apply.py index 622cbe203..1b31ea57d 100644 --- a/auto_round/algorithms/transforms/hadamard/apply.py +++ b/auto_round/algorithms/transforms/hadamard/apply.py @@ -27,12 +27,12 @@ import tqdm from auto_round.algorithms.transforms.base import BaseRotation -from auto_round.algorithms.transforms.hadamard.config import HadamardConfig, normalize_hadamard_config +from auto_round.algorithms.transforms.hadamard.config import RotationConfig, normalize_rotation_config from auto_round.algorithms.transforms.hadamard.transforms import build_hadamard_transform from auto_round.compressors.utils import is_nv_fp from auto_round.experimental.qmodules.base import QModuleBase -__all__ = ["HadamardRotation", "apply_hadamard_transform"] +__all__ = ["HadamardRotation", "apply_rotation_transform", "apply_hadamard_transform"] def _triton_available(data_type: str = "mx_fp") -> bool: @@ -71,14 +71,14 @@ class HadamardRotation(BaseRotation): model = apply_hadamard_transform(model, config=HadamardConfig(), need_calibration=True) """ - def __init__(self, config: HadamardConfig) -> None: + def __init__(self, config: RotationConfig) -> None: super().__init__(config) @classmethod - def from_config(cls, config: dict | HadamardConfig) -> "HadamardRotation": - """Build a :class:`HadamardRotation` from a raw dict or :class:`HadamardConfig`.""" + def from_config(cls, config: dict | RotationConfig) -> "HadamardRotation": + """Build a :class:`HadamardRotation` from a raw dict or :class:`RotationConfig`.""" if isinstance(config, dict): - config = HadamardConfig.model_validate(config) + config = RotationConfig.model_validate(config) return cls(config) def apply_to_model( @@ -102,8 +102,8 @@ def 
apply_to_model( **kwargs: Reserved for future use. Returns: - The mutated *model* with ``model.hadamard_config`` set to the - normalised :class:`HadamardConfig`. + The mutated *model* with ``model.rotation_config`` set to the + normalised :class:`RotationConfig` dict. """ cfg = self.config @@ -119,7 +119,7 @@ def apply_to_model( _apply_to_module(model, module, cfg, location, data_type) # Store config on model for serialisation / downstream inspection. - setattr(model, "hadamard_config", cfg) + setattr(model, "rotation_config", cfg.model_dump()) return model @@ -131,7 +131,7 @@ def apply_to_model( def _apply_to_module( model: torch.nn.Module, module: torch.nn.Module, - config: HadamardConfig, + config: RotationConfig, location: str, data_type: str = "mx_fp", ) -> None: @@ -146,7 +146,7 @@ def _apply_to_module( raise NotImplementedError(f"Unsupported transform location: {location!r}") -def _apply_input_transform(module: torch.nn.Module, config: HadamardConfig, data_type: str = "mx_fp") -> None: +def _apply_input_transform(module: torch.nn.Module, config: RotationConfig, data_type: str = "mx_fp") -> None: """Register a forward pre-hook that applies the Hadamard to the input activation.""" from auto_round.algorithms.transforms.hadamard.utils.matrix import multihead_matmul @@ -196,7 +196,7 @@ def _input_hook(self, args): def _apply_weight_transform( module: torch.nn.Module, - config: HadamardConfig, + config: RotationConfig, ) -> None: """Fuse or patch the Hadamard rotation into the weight of *module*.""" from auto_round.algorithms.transforms.hadamard.patch import ( @@ -238,9 +238,9 @@ def _apply_weight_transform( # --------------------------------------------------------------------------- -def apply_hadamard_transform( +def apply_rotation_transform( model: torch.nn.Module, - config: str | dict | HadamardConfig | None, + config: str | dict | RotationConfig | None, location: str = "weight", use_tqdm: bool = True, desc: str | None = None, @@ -253,7 +253,7 @@ def apply_hadamard_transform( Args: model: Target model. - config: One of: :class:`HadamardConfig`, ``dict``, ``str`` + config: One of: :class:`RotationConfig`, ``dict``, ``str`` shorthand, or ``None`` (no-op). location: ``"weight"`` or ``"input"``. use_tqdm: Show progress bar. @@ -263,7 +263,7 @@ def apply_hadamard_transform( Returns: The transformed model. """ - normalised = normalize_hadamard_config(config) + normalised = normalize_rotation_config(config) if not normalised: return model rotation = HadamardRotation.from_config(normalised) @@ -274,3 +274,7 @@ def apply_hadamard_transform( desc=desc, data_type=data_type, ) + + +# Backward-compatibility alias +apply_hadamard_transform = apply_rotation_transform diff --git a/auto_round/algorithms/transforms/hadamard/config.py b/auto_round/algorithms/transforms/hadamard/config.py index df85d8103..a5e891f63 100644 --- a/auto_round/algorithms/transforms/hadamard/config.py +++ b/auto_round/algorithms/transforms/hadamard/config.py @@ -23,13 +23,13 @@ from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.utils import logger -__all__ = ["HadamardConfig", "normalize_hadamard_config"] +__all__ = ["RotationConfig", "normalize_rotation_config", "HadamardConfig", "normalize_hadamard_config"] # Supported Hadamard transform types (also used by HadamardTransform registry). 
-HADAMARD_TYPES: frozenset[str] = frozenset({"hadamard", "random_hadamard"}) +HADAMARD_TYPES: frozenset[str] = frozenset({"hadamard", "random_hadamard", "quarot_hadamard"}) -class HadamardConfig(BaseModel, BaseRotationConfig): +class RotationConfig(BaseModel, BaseRotationConfig): """Configuration for Hadamard rotation transforms. This config is designed to be embedded inside a model's ``config.json`` @@ -40,8 +40,8 @@ class HadamardConfig(BaseModel, BaseRotationConfig): algorithm: Fixed to ``"hadamard"`` – identifies this config in the :class:`~auto_round.algorithms.transforms.base.BaseRotation` registry. block_size: Block size for the block-diagonal Hadamard matrix. - hadamard_type: Which transform to use (``"hadamard"`` or - ``"random_hadamard"``). + hadamard_type: Which transform to use (``"hadamard"``, ``"random_hadamard"``, + or ``"quarot_hadamard"``). random_seed: For ``"random_hadamard"`` – seed the generator for reproducibility. Excluded from serialisation (``exclude=True``) because it is a calibration-time detail. @@ -63,25 +63,29 @@ def _validate_hadamard_type(cls, v: str) -> str: return v -def normalize_hadamard_config( - config: str | dict | HadamardConfig | None, +# Backward-compatibility alias +HadamardConfig = RotationConfig + + +def normalize_rotation_config( + config: str | dict | RotationConfig | None, data_type: str = "mx_fp", ) -> dict[str, Any]: - """Normalise various input forms to a canonical ``dict`` for :class:`HadamardConfig`. + """Normalise various input forms to a canonical ``dict`` for :class:`RotationConfig`. Args: config: One of: * ``None`` → returns ``{}`` - * ``dict`` → validated via :class:`HadamardConfig` - * :class:`HadamardConfig` → converted to ``dict`` + * ``dict`` → validated via :class:`RotationConfig` + * :class:`RotationConfig` → converted to ``dict`` * ``str`` shorthand → treated as ``hadamard_type`` - (``"default"`` → default :class:`HadamardConfig`) + (``"default"`` → default :class:`RotationConfig`) data_type: Quantization data type. Used to infer ``block_size`` when not explicitly set (mx_fp → 32, nv_fp → 16). Returns: - A validated ``dict`` that can be passed to ``HadamardConfig(**result)``. + A validated ``dict`` that can be passed to ``RotationConfig(**result)``. Raises: ValueError: If the config is invalid. @@ -100,7 +104,7 @@ def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_ else: logger.warning( f"block_size is not set and cannot be inferred for data_type {data_type!r}; " - "please set block_size explicitly in hadamard_config if needed." + "please set block_size explicitly in rotation_config if needed." 
) else: if is_mx_fp(data_type) and block_size != 32: @@ -113,24 +117,24 @@ def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_ if config is None: return {} - if isinstance(config, HadamardConfig): + if isinstance(config, RotationConfig): raw_cfg_dict = config.model_dump(exclude_unset=True) block_size_explicitly_set = "block_size" in raw_cfg_dict cfg_dict = dict(raw_cfg_dict) cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) try: - return HadamardConfig.model_validate(cfg_dict).model_dump() + return RotationConfig.model_validate(cfg_dict).model_dump() except Exception as exc: - raise ValueError(f"Invalid HadamardConfig: {exc}") from exc + raise ValueError(f"Invalid RotationConfig: {exc}") from exc if isinstance(config, dict): block_size_explicitly_set = "block_size" in config cfg_dict = dict(config) cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) try: - return HadamardConfig.model_validate(cfg_dict).model_dump() + return RotationConfig.model_validate(cfg_dict).model_dump() except Exception as exc: - raise ValueError(f"Invalid HadamardConfig dict: {exc}") from exc + raise ValueError(f"Invalid RotationConfig dict: {exc}") from exc if isinstance(config, str): key = config.strip() @@ -140,19 +144,23 @@ def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_ cfg_dict = {} cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) try: - return HadamardConfig.model_validate(cfg_dict).model_dump() + return RotationConfig.model_validate(cfg_dict).model_dump() except Exception as exc: - raise ValueError(f"Invalid default hadamard_config after data_type adjustment: {exc}") from exc + raise ValueError(f"Invalid default rotation_config after data_type adjustment: {exc}") from exc if key not in HADAMARD_TYPES: raise ValueError( - f"Unrecognised hadamard config string: {key!r}. " + f"Unrecognised rotation config string: {key!r}. " f"Expected one of {sorted(HADAMARD_TYPES)} or 'default'." 
) cfg_dict = {"hadamard_type": key} cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) try: - return HadamardConfig.model_validate(cfg_dict).model_dump() + return RotationConfig.model_validate(cfg_dict).model_dump() except Exception as exc: - raise ValueError(f"Failed to build HadamardConfig from {key!r}: {exc}") from exc + raise ValueError(f"Failed to build RotationConfig from {key!r}: {exc}") from exc + + raise TypeError("rotation_config must be None, dict, RotationConfig, or str " f"(got {type(config).__name__})") + - raise TypeError("hadamard_config must be None, dict, HadamardConfig, or str " f"(got {type(config).__name__})") +# Backward-compatibility alias +normalize_hadamard_config = normalize_rotation_config diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 901ce19bf..4c4ef30bc 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -9,7 +9,7 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -from auto_round.algorithms.transforms.hadamard.config import HadamardConfig +from auto_round.algorithms.transforms.hadamard.config import RotationConfig as _NewArchRotationConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor from auto_round.compressors_new.utils import check_need_act_calibration @@ -153,7 +153,7 @@ class AutoRound(object): "sign_round": SignRoundConfig, "signround": SignRoundConfig, "rtn": RTNConfig, - "hadamard": HadamardConfig, + "hadamard": _NewArchRotationConfig, } @classmethod @@ -455,16 +455,14 @@ def __new__( # Determine output format if specified format = kwargs.pop("format", None) - # Extract rotation_config (old-API kwarg) and convert to HadamardConfig for new arch. + # Extract rotation_config (old-API kwarg) and convert to new arch RotationConfig. # In old arch, rotation_config was a keyword arg; in new arch, rotation transforms # are passed as part of alg_configs list. "inplace" backend is not yet supported # in the new arch (requires CUDA/triton), so we only convert transform-compatible configs. _rotation_config_raw = kwargs.pop("rotation_config", None) if _rotation_config_raw is not None: - from auto_round.algorithms.transforms.hadamard.config import ( - HadamardConfig, - normalize_hadamard_config, - ) + from auto_round.algorithms.transforms.hadamard.config import RotationConfig as _NARotCfg + from auto_round.algorithms.transforms.hadamard.config import normalize_rotation_config as _normalize_rc from auto_round.experimental.transform.rotation_config import RotationConfig as _RotationConfig # Resolve to a RotationConfig to check the backend field @@ -480,15 +478,15 @@ def __new__( logger.warning( "rotation_config with backend='inplace' is not yet supported in the new architecture. " "The rotation will be skipped. Use backend='transform' or backend='auto' " - "with an MXFP4/NVFP4 scheme, or pass HadamardConfig() explicitly via alg_configs." + "with an MXFP4/NVFP4 scheme, or pass RotationConfig() explicitly via alg_configs." ) else: - # Convert to HadamardConfig understood by the new arch. - # normalize_hadamard_config only accepts None/str/dict/HadamardConfig, so - # convert RotationConfig instances to dict first (dropping backend field). + # Convert to new arch RotationConfig. 
+ # normalize_rotation_config accepts None/str/dict/RotationConfig, so + # convert old-arch RotationConfig instances to dict first (dropping backend field). _raw_for_norm = _rc.model_dump(exclude={"backend", "fuse_online_to_weight", "allow_online_rotation"}) - hadamard_dict = normalize_hadamard_config(_raw_for_norm) - hadamard_cfg = HadamardConfig.model_validate(hadamard_dict) + hadamard_dict = _normalize_rc(_raw_for_norm) + hadamard_cfg = _NARotCfg.model_validate(hadamard_dict) config = [config, hadamard_cfg] # Extract MLLM-specific parameters From 4f5341002b1bf7d2309ec430ed41571ac2371127 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 11:03:41 +0800 Subject: [PATCH 75/90] refactor: rename algorithms/transforms/hadamard -> rotation --- auto_round/algorithms/transforms/__init__.py | 23 +- .../{hadamard => rotation}/__init__.py | 18 +- .../{hadamard => rotation}/apply.py | 26 +- .../{hadamard => rotation}/config.py | 12 +- .../{hadamard => rotation}/patch.py | 0 .../{hadamard => rotation}/transforms.py | 4 +- .../{hadamard => rotation}/utils/__init__.py | 0 .../utils/hadamards.safetensors | Bin .../{hadamard => rotation}/utils/math.py | 0 .../{hadamard => rotation}/utils/matrix.py | 0 .../utils/triton/__init__.py | 0 .../utils/triton/mxfp4.py | 0 auto_round/compressors_new/entry.py | 6 +- auto_round/schemes.py | 1 - benchmark_both.py | 78 ++++++ docs/fp8_new_arch_debug_handoff.md | 222 ++++++++++++++++++ performance_ut.sh | 115 +++++++++ profile_rss_per_block.py | 194 +++++++++++++++ 18 files changed, 640 insertions(+), 59 deletions(-) rename auto_round/algorithms/transforms/{hadamard => rotation}/__init__.py (69%) rename auto_round/algorithms/transforms/{hadamard => rotation}/apply.py (91%) rename auto_round/algorithms/transforms/{hadamard => rotation}/config.py (95%) rename auto_round/algorithms/transforms/{hadamard => rotation}/patch.py (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/transforms.py (97%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/__init__.py (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/hadamards.safetensors (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/math.py (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/matrix.py (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/triton/__init__.py (100%) rename auto_round/algorithms/transforms/{hadamard => rotation}/utils/triton/mxfp4.py (100%) create mode 100644 benchmark_both.py create mode 100644 docs/fp8_new_arch_debug_handoff.md create mode 100644 performance_ut.sh create mode 100644 profile_rss_per_block.py diff --git a/auto_round/algorithms/transforms/__init__.py b/auto_round/algorithms/transforms/__init__.py index 8ede2ee31..6648cf6d6 100644 --- a/auto_round/algorithms/transforms/__init__.py +++ b/auto_round/algorithms/transforms/__init__.py @@ -20,7 +20,7 @@ Current algorithms ------------------ * **hadamard** – Block-diagonal Hadamard rotations (QuaRot / SpinQuant style). - See :mod:`auto_round.algorithms.transforms.hadamard`. + See :mod:`auto_round.algorithms.transforms.rotation`. 
Adding a new algorithm ----------------------- @@ -47,13 +47,10 @@ ROTATION_SUPPORTED_SCHEMES, check_supported_schemes, ) -from auto_round.algorithms.transforms.hadamard import ( - HadamardConfig, +from auto_round.algorithms.transforms.rotation import ( HadamardRotation, - apply_hadamard_transform, apply_rotation_transform, - normalize_hadamard_config, - normalize_rotation_config as _normalize_hadamard_for_transforms, + normalize_rotation_config as _normalize_hadamard_config, RotationConfig, ) @@ -63,14 +60,10 @@ "BaseRotationConfig", "ROTATION_SUPPORTED_SCHEMES", "check_supported_schemes", - # Config (new names) + # Config "RotationConfig", - "apply_rotation_transform", - # Config (backward-compat aliases) - "HadamardConfig", "HadamardRotation", - "apply_hadamard_transform", - "normalize_hadamard_config", + "apply_rotation_transform", # Unified entry "apply_rotation", "normalize_rotation_config", @@ -86,7 +79,7 @@ def normalize_rotation_config( legacy dicts that only carry Hadamard keys). Args: - config: One of: ``None``, :class:`HadamardConfig`, a ``dict`` with + config: One of: ``None``, :class:`RotationConfig`, a ``dict`` with an ``"algorithm"`` key, or a plain Hadamard shorthand string. Returns: @@ -109,7 +102,7 @@ def normalize_rotation_config( if isinstance(config, str): # String shorthand → treat as Hadamard config. - return RotationConfig.model_validate(_normalize_hadamard_for_transforms(config)) + return RotationConfig.model_validate(_normalize_hadamard_config(config)) raise TypeError( f"Unsupported rotation config type: {type(config).__name__}. " @@ -133,7 +126,7 @@ def apply_rotation( config: Rotation configuration. Accepts: * ``None`` – no-op, returns *model* unmodified. - * :class:`HadamardConfig` or compatible ``dict``/``str``. + * :class:`RotationConfig` or compatible ``dict``/``str``. * Any :class:`BaseRotationConfig` subclass. data_type: Quantization data type (e.g. ``"mx_fp"``). diff --git a/auto_round/algorithms/transforms/hadamard/__init__.py b/auto_round/algorithms/transforms/rotation/__init__.py similarity index 69% rename from auto_round/algorithms/transforms/hadamard/__init__.py rename to auto_round/algorithms/transforms/rotation/__init__.py index 8ed1c8a06..640cf06ab 100644 --- a/auto_round/algorithms/transforms/hadamard/__init__.py +++ b/auto_round/algorithms/transforms/rotation/__init__.py @@ -13,18 +13,15 @@ # limitations under the License. 
"""Hadamard rotation sub-package for ``algorithms/transforms``.""" -from auto_round.algorithms.transforms.hadamard.apply import ( +from auto_round.algorithms.transforms.rotation.apply import ( HadamardRotation, apply_rotation_transform, - apply_hadamard_transform, ) -from auto_round.algorithms.transforms.hadamard.config import ( +from auto_round.algorithms.transforms.rotation.config import ( RotationConfig, normalize_rotation_config, - HadamardConfig, - normalize_hadamard_config, ) -from auto_round.algorithms.transforms.hadamard.transforms import ( +from auto_round.algorithms.transforms.rotation.transforms import ( HADAMARDS, HadamardTransform, RandomHadamardTransform, @@ -34,19 +31,14 @@ __all__ = [ # Algorithm class "HadamardRotation", - # Config (new names) + # Config "RotationConfig", "normalize_rotation_config", - # Config (backward-compat aliases) - "HadamardConfig", - "normalize_hadamard_config", # Transform modules "HadamardTransform", "RandomHadamardTransform", "HADAMARDS", "build_hadamard_transform", - # One-shot convenience (new name) + # One-shot convenience "apply_rotation_transform", - # One-shot convenience (backward-compat alias) - "apply_hadamard_transform", ] diff --git a/auto_round/algorithms/transforms/hadamard/apply.py b/auto_round/algorithms/transforms/rotation/apply.py similarity index 91% rename from auto_round/algorithms/transforms/hadamard/apply.py rename to auto_round/algorithms/transforms/rotation/apply.py index 1b31ea57d..bf2144e76 100644 --- a/auto_round/algorithms/transforms/hadamard/apply.py +++ b/auto_round/algorithms/transforms/rotation/apply.py @@ -16,7 +16,7 @@ Public entry points ------------------- * :class:`HadamardRotation` – the stateful algorithm object. -* :func:`apply_hadamard_transform` – convenience one-shot function. +* :func:`apply_rotation_transform` – convenience one-shot function. 
""" from __future__ import annotations @@ -27,12 +27,12 @@ import tqdm from auto_round.algorithms.transforms.base import BaseRotation -from auto_round.algorithms.transforms.hadamard.config import RotationConfig, normalize_rotation_config -from auto_round.algorithms.transforms.hadamard.transforms import build_hadamard_transform +from auto_round.algorithms.transforms.rotation.config import RotationConfig, normalize_rotation_config +from auto_round.algorithms.transforms.rotation.transforms import build_hadamard_transform from auto_round.compressors.utils import is_nv_fp from auto_round.experimental.qmodules.base import QModuleBase -__all__ = ["HadamardRotation", "apply_rotation_transform", "apply_hadamard_transform"] +__all__ = ["HadamardRotation", "apply_rotation_transform"] def _triton_available(data_type: str = "mx_fp") -> bool: @@ -44,7 +44,7 @@ def _triton_available(data_type: str = "mx_fp") -> bool: if not torch.cuda.is_available(): return False - from auto_round.algorithms.transforms.hadamard.utils.triton.mxfp4 import ( # noqa: F401 + from auto_round.algorithms.transforms.rotation.utils.triton.mxfp4 import ( # noqa: F401 mxfp4_forward_kernel_wrapper, ) @@ -67,8 +67,8 @@ class HadamardRotation(BaseRotation): Or directly:: - from auto_round.algorithms.transforms.hadamard import apply_hadamard_transform - model = apply_hadamard_transform(model, config=HadamardConfig(), need_calibration=True) + from auto_round.algorithms.transforms.rotation import apply_rotation_transform + model = apply_rotation_transform(model, config=RotationConfig(), need_calibration=True) """ def __init__(self, config: RotationConfig) -> None: @@ -148,7 +148,7 @@ def _apply_to_module( def _apply_input_transform(module: torch.nn.Module, config: RotationConfig, data_type: str = "mx_fp") -> None: """Register a forward pre-hook that applies the Hadamard to the input activation.""" - from auto_round.algorithms.transforms.hadamard.utils.matrix import multihead_matmul + from auto_round.algorithms.transforms.rotation.utils.matrix import multihead_matmul inp_transform = build_hadamard_transform( **config.model_dump(), @@ -164,7 +164,7 @@ def _apply_input_transform(module: torch.nn.Module, config: RotationConfig, data hadamard_weight = None if _triton_available(data_type): - from auto_round.algorithms.transforms.hadamard.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper + from auto_round.algorithms.transforms.rotation.utils.triton.mxfp4 import mxfp4_forward_kernel_wrapper def _input_hook(self, args): x = args[0] @@ -199,7 +199,7 @@ def _apply_weight_transform( config: RotationConfig, ) -> None: """Fuse or patch the Hadamard rotation into the weight of *module*.""" - from auto_round.algorithms.transforms.hadamard.patch import ( + from auto_round.algorithms.transforms.rotation.patch import ( patch_quantlinear, patch_wrapperlinear_to_apply_transform, patch_wrapperwalayer_forward_to_apply_transform, @@ -215,7 +215,7 @@ def _apply_weight_transform( # For random Hadamard, save the matrix as a submodule for serialisation. 
if config.hadamard_type == "random_hadamard": - from auto_round.algorithms.transforms.hadamard.patch import patch_quantlinear as _patch_ql + from auto_round.algorithms.transforms.rotation.patch import patch_quantlinear as _patch_ql _patch_ql(w_transform) @@ -274,7 +274,3 @@ def apply_rotation_transform( desc=desc, data_type=data_type, ) - - -# Backward-compatibility alias -apply_hadamard_transform = apply_rotation_transform diff --git a/auto_round/algorithms/transforms/hadamard/config.py b/auto_round/algorithms/transforms/rotation/config.py similarity index 95% rename from auto_round/algorithms/transforms/hadamard/config.py rename to auto_round/algorithms/transforms/rotation/config.py index a5e891f63..ee1ab8ee8 100644 --- a/auto_round/algorithms/transforms/hadamard/config.py +++ b/auto_round/algorithms/transforms/rotation/config.py @@ -23,7 +23,7 @@ from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.utils import logger -__all__ = ["RotationConfig", "normalize_rotation_config", "HadamardConfig", "normalize_hadamard_config"] +__all__ = ["RotationConfig", "normalize_rotation_config"] # Supported Hadamard transform types (also used by HadamardTransform registry). HADAMARD_TYPES: frozenset[str] = frozenset({"hadamard", "random_hadamard", "quarot_hadamard"}) @@ -34,7 +34,7 @@ class RotationConfig(BaseModel, BaseRotationConfig): This config is designed to be embedded inside a model's ``config.json`` for serialisation, and is also used at runtime to drive - :class:`~auto_round.algorithms.transforms.hadamard.apply.HadamardRotation`. + :class:`~auto_round.algorithms.transforms.rotation.apply.HadamardRotation`. Attributes: algorithm: Fixed to ``"hadamard"`` – identifies this config in the @@ -63,10 +63,6 @@ def _validate_hadamard_type(cls, v: str) -> str: return v -# Backward-compatibility alias -HadamardConfig = RotationConfig - - def normalize_rotation_config( config: str | dict | RotationConfig | None, data_type: str = "mx_fp", @@ -160,7 +156,3 @@ def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_ raise ValueError(f"Failed to build RotationConfig from {key!r}: {exc}") from exc raise TypeError("rotation_config must be None, dict, RotationConfig, or str " f"(got {type(config).__name__})") - - -# Backward-compatibility alias -normalize_hadamard_config = normalize_rotation_config diff --git a/auto_round/algorithms/transforms/hadamard/patch.py b/auto_round/algorithms/transforms/rotation/patch.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/patch.py rename to auto_round/algorithms/transforms/rotation/patch.py diff --git a/auto_round/algorithms/transforms/hadamard/transforms.py b/auto_round/algorithms/transforms/rotation/transforms.py similarity index 97% rename from auto_round/algorithms/transforms/hadamard/transforms.py rename to auto_round/algorithms/transforms/rotation/transforms.py index 8f70e46b8..9725b2d9a 100644 --- a/auto_round/algorithms/transforms/hadamard/transforms.py +++ b/auto_round/algorithms/transforms/rotation/transforms.py @@ -27,11 +27,11 @@ import torch import torch.nn as nn -from auto_round.algorithms.transforms.hadamard.utils.math import ( +from auto_round.algorithms.transforms.rotation.utils.math import ( deterministic_hadamard_matrix, random_hadamard_matrix, ) -from auto_round.algorithms.transforms.hadamard.utils.matrix import apply_transform_weight +from auto_round.algorithms.transforms.rotation.utils.matrix import apply_transform_weight __all__ = [ "HadamardTransform", diff --git 
a/auto_round/algorithms/transforms/hadamard/utils/__init__.py b/auto_round/algorithms/transforms/rotation/utils/__init__.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/__init__.py rename to auto_round/algorithms/transforms/rotation/utils/__init__.py diff --git a/auto_round/algorithms/transforms/hadamard/utils/hadamards.safetensors b/auto_round/algorithms/transforms/rotation/utils/hadamards.safetensors similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/hadamards.safetensors rename to auto_round/algorithms/transforms/rotation/utils/hadamards.safetensors diff --git a/auto_round/algorithms/transforms/hadamard/utils/math.py b/auto_round/algorithms/transforms/rotation/utils/math.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/math.py rename to auto_round/algorithms/transforms/rotation/utils/math.py diff --git a/auto_round/algorithms/transforms/hadamard/utils/matrix.py b/auto_round/algorithms/transforms/rotation/utils/matrix.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/matrix.py rename to auto_round/algorithms/transforms/rotation/utils/matrix.py diff --git a/auto_round/algorithms/transforms/hadamard/utils/triton/__init__.py b/auto_round/algorithms/transforms/rotation/utils/triton/__init__.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/triton/__init__.py rename to auto_round/algorithms/transforms/rotation/utils/triton/__init__.py diff --git a/auto_round/algorithms/transforms/hadamard/utils/triton/mxfp4.py b/auto_round/algorithms/transforms/rotation/utils/triton/mxfp4.py similarity index 100% rename from auto_round/algorithms/transforms/hadamard/utils/triton/mxfp4.py rename to auto_round/algorithms/transforms/rotation/utils/triton/mxfp4.py diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 4c4ef30bc..ef9c1bd34 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -9,7 +9,7 @@ from auto_round.algorithms.alg_config import AlgConfig from auto_round.algorithms.quantization.rtn.config import RTNConfig from auto_round.algorithms.quantization.sign_round.config import SignRoundConfig -from auto_round.algorithms.transforms.hadamard.config import RotationConfig as _NewArchRotationConfig +from auto_round.algorithms.transforms.rotation.config import RotationConfig as _NewArchRotationConfig from auto_round.auto_scheme.gen_auto_scheme import AutoScheme from auto_round.compressors_new.calib import CalibCompressor, CalibratedRTNCompressor from auto_round.compressors_new.utils import check_need_act_calibration @@ -461,8 +461,8 @@ def __new__( # in the new arch (requires CUDA/triton), so we only convert transform-compatible configs. 
_rotation_config_raw = kwargs.pop("rotation_config", None) if _rotation_config_raw is not None: - from auto_round.algorithms.transforms.hadamard.config import RotationConfig as _NARotCfg - from auto_round.algorithms.transforms.hadamard.config import normalize_rotation_config as _normalize_rc + from auto_round.algorithms.transforms.rotation.config import RotationConfig as _NARotCfg + from auto_round.algorithms.transforms.rotation.config import normalize_rotation_config as _normalize_rc from auto_round.experimental.transform.rotation_config import RotationConfig as _RotationConfig # Resolve to a RotationConfig to check the backend field diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 1b751c5a0..8ee5b2ed9 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -502,7 +502,6 @@ def _parse_scheme( } ) - # For AutoScheme 16 bits options BF16 = QuantizationScheme.from_dict( { diff --git a/benchmark_both.py b/benchmark_both.py new file mode 100644 index 000000000..3edbc04ef --- /dev/null +++ b/benchmark_both.py @@ -0,0 +1,78 @@ +"""Quick A/B benchmark: old (compressors) vs new (compressors_new) architecture. + +Uses AR_DISABLE_NEW_ARCH env-var to toggle. Runs each configuration in a +subprocess to avoid cross-contamination, with a warmup run to fill OS page cache. +""" + +import json +import os +import subprocess +import sys +import time + + +MODEL = "Qwen/Qwen3-0.6B" +ITERS = "200" +SCHEME = "W4A16" +DEVICE = "cuda:0" + +CMD_TEMPLATE = [ + sys.executable, "-m", "auto_round", + "--model_name", MODEL, + "--scheme", SCHEME, + "--iters", ITERS, + "--device", DEVICE, +] + + +def run_once(label: str, env_override: dict) -> float: + env = {**os.environ, **env_override} + print(f"\n{'='*60}") + print(f" Running: {label}") + print(f" AR_DISABLE_NEW_ARCH={env.get('AR_DISABLE_NEW_ARCH', 'unset')}") + print(f"{'='*60}", flush=True) + t0 = time.perf_counter() + proc = subprocess.run(CMD_TEMPLATE, env=env, capture_output=True, text=True) + elapsed = time.perf_counter() - t0 + if proc.returncode != 0: + print(f"STDERR:\n{proc.stderr[-2000:]}") + raise RuntimeError(f"{label} failed with rc={proc.returncode}") + print(f" {label}: {elapsed:.1f}s") + return elapsed + + +def main(): + # Warmup: fill OS page cache & JIT caches + print("Warmup run (old arch)...") + run_once("warmup", {"AR_DISABLE_NEW_ARCH": "1"}) + + # Interleaved runs to reduce bias + results = {"old": [], "new": []} + for trial in range(2): + if trial % 2 == 0: + first, second = ("old", "1"), ("new", "0") + else: + first, second = ("new", "0"), ("old", "1") + for label, flag in [first, second]: + t = run_once(f"{label} (trial {trial+1})", {"AR_DISABLE_NEW_ARCH": flag}) + results[label].append(t) + + # Summary + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + for arch in ["old", "new"]: + times = results[arch] + avg = sum(times) / len(times) + print(f" {arch}: {[f'{t:.1f}' for t in times]} avg={avg:.1f}s") + old_avg = sum(results["old"]) / len(results["old"]) + new_avg = sum(results["new"]) / len(results["new"]) + diff_pct = (new_avg - old_avg) / old_avg * 100 + print(f"\n Diff: {diff_pct:+.1f}% (new vs old)") + print(f" {'PASS' if abs(diff_pct) < 5 else 'FAIL'} (threshold: ±5%)") + + json.dump(results, open("benchmark_results/latest.json", "w"), indent=2) + + +if __name__ == "__main__": + main() diff --git a/docs/fp8_new_arch_debug_handoff.md b/docs/fp8_new_arch_debug_handoff.md new file mode 100644 index 000000000..a269b66f9 --- /dev/null +++ b/docs/fp8_new_arch_debug_handoff.md @@ -0,0 +1,222 @@ +# FP8 
Scheme Debug Handoff — `hengguo/new_ar_arch` (PR #1542)
+
+> **Purpose**: Hand-off for the next AI/engineer continuing FP8 regression
+> debugging on the new AutoRound architecture (`auto_round/compressors_new/`).
+> Skip the discovery phase — this doc captures what's broken, what's already
+> fixed, how to reach the HPU test environment, and the exact commands that
+> reproduce results. Read this end-to-end **before touching code**.
+
+---
+
+## 1. Context
+
+- Repo: `intel/auto-round`
+- Branch: `hengguo/new_ar_arch` (PR #1542 "Step1 new architecture for auto_round")
+- Origin: `18ba254d merge main` (last pushed commit; several fixes are still **uncommitted** locally — see §5)
+- Two parallel code paths exist:
+  - **Old arch**: `auto_round/compressors/` (baseline, `main` branch)
+  - **New arch**: `auto_round/compressors_new/` + `auto_round/algorithms/` + `auto_round/context/` (this PR)
+- Routing entry: `auto_round/compressors_new/entry.py::AutoRound.__new__` picks old vs new compressor class by config type.
+
+### Key architectural differences vs old arch
+
+- `BaseCompressor.__getattr__` delegates attribute access to **three contexts** in order: `quantize_config`, `model_context`, `compress_context`. Missing attribute on all three → `AttributeError`. Many latent bugs were caused by attributes existing on the wrong context.
+- Per-sample batching logic moved from `BaseCompressor._get_batch_data` (old) to `BaseQuantizers._sampling_inputs` in `auto_round/algorithms/quantization/base.py` (new).
+- `share_cache_keys = ('position_ids', 'cache_position', 'position_embeddings')` — values cached **once** (not per-sample), typically wrapped by hook as `[val]`. New arch needs to unwrap and pass through regardless of `len(indices)`.
+- Immediate packing flag is `self.compress_context.is_immediate_packing` (not `is_immediate_saving`). Conflating the two caused the original FP8_STATIC RAM regression.
+
+---
+
+## 2. The FP8_STATIC Host-RAM Regression (primary motivating bug)
+
+### Symptom
+On HPU, new-arch `FP8_STATIC` (static W8A8-FP8) tuning leaked ~GBs of host RAM per block vs old arch — traced to HPU eager-pipeline host-side growth when the static-activation calibration path runs.
+
+### Root causes & fixes (already in tree)
+
+1. **Immediate packing trigger flag**
+   `auto_round/compressors_new/calib.py` ~L1031:
+   ```python
+   if self.compress_context.is_immediate_packing:  # was: is_immediate_saving
+       ...
+   ```
+   Without this, packed weights were held in CPU RAM indefinitely.
+
+2. **`tmp_dtype` missing**
+   `auto_round/compressors_new/calib.py` ~L1424: added
+   ```python
+   tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32
+   ```
+   Matches old arch.
+
+### Status
+**Fix #1 verified on HPU** (see §6). Do not revert these two pieces.
+
+> **Note (2026-04-22):** An earlier version of this branch added `_needs_hpu_fp8_static_eager_guard` /
+> `_maybe_disable_hpu_eager_pipeline` in `entry.py` that set `PT_HPU_EAGER_PIPELINE_ENABLE=0`
+> for FP8_STATIC on HPU. This was **speculative** (never confirmed to reduce RAM) and has been
+> **deleted**. The `is_immediate_packing` fix in calib.py is the real fix.
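+
+For quick orientation, the following is a minimal sketch (hypothetical class and names, not the actual `BaseCompressor` code) of the three-context attribute lookup described in §1, which is where the wrong-context attribute bugs mentioned there come from:
+
+```python
+class CompressorContexts:
+    """Illustrative only: mirrors the delegation order, not the real implementation."""
+
+    def __init__(self, quantize_config, model_context, compress_context):
+        self.quantize_config = quantize_config
+        self.model_context = model_context
+        self.compress_context = compress_context
+
+    def __getattr__(self, name):
+        # Only called when normal lookup fails; try the three contexts in order.
+        for ctx_name in ("quantize_config", "model_context", "compress_context"):
+            ctx = self.__dict__.get(ctx_name)
+            if ctx is not None and hasattr(ctx, name):
+                return getattr(ctx, name)
+        raise AttributeError(name)
+```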
+
+---
+
+## 3. Latest performance report from user (the trigger for this handoff)
+
+Reported CI numbers (from `performance_ut.sh`, model `Qwen/Qwen3-0.6B`, scheme likely W4A16, the default):
+
+```
+Tuning Time (s) : Current = 1192.5 | Baseline = 445.7 (+167.58%) FAIL
+Peak RAM (GB) : Current = 3.68 | Baseline = 4.05 (−9.14%) FAIL (tolerance)
+Peak VRAM (GB) : Current = 1.29 | Baseline = 26.73 (−95.17%) FAIL
+Output Size (GB) : Current = 0.7114 | Baseline = 0.7114 (+0.00%) PASS
+```
+
+### CRITICAL finding from direct HPU test (iters=20, same scheme/model)
+
+Running the exact same binary on the HPU box (see §4) produced:
+
+```
+Quantizing model.layers.0 ... peak_ram=2.83GB peak_vram=1.25GB (block 0 only, expected)
+Quantizing model.layers.1 ... peak_ram=2.86GB peak_vram=26.43GB ← HPU *is* being used
+Quantizing model.layers.27 ... peak_ram=3.62GB peak_vram=26.61GB
+quantization tuning time 68.34s for 20 iters → linear scale 200 iters ≈ 683s
+real 1m24s user 11m16s
+```
+
+**So on the actual HPU**, `Peak VRAM = 26.61 GB` (matches baseline 26.73 GB).
+The CI-reported `1.29 GB` equals the **first block only** (`model.layers.0`), before HPU allocation expands. This strongly suggests:
+
+- `check_performance.py` in the CI pipeline is picking up the first `peak_vram` log line (block 0 = 1.25–1.29 GB) and missing later ones, **OR**
+- The CI run crashed/exited after block 0 (VRAM never grew) but reported partial data as "success".
+
+The tuning-time gap (683 s estimated locally, 1192 s in CI) is real but smaller than the 2.7× the CI report suggests. Likely contributors:
+- `torch.compile` recompile on every block (new-arch cache invalidation logic).
+- Caching / `_sampling_inputs` overhead per step.
+- CI docker env differences (no model warm cache, different HPU driver).
+
+### Action for next agent
+1. **Do NOT assume VRAM=1.29 GB is real**. First re-read `.azure-pipelines/scripts/performance/check_performance.py` — it likely parses the log incorrectly. Fix the parser before believing the VRAM number.
+2. Investigate the tuning-time gap separately from VRAM:
+   - Profile `_resolve_block_forward`: does `torch.compile` actually hit the compiled path, or does `self` delegate through `__getattr__` to a context that returns `False` for `enable_torch_compile`?
+   - Profile `_sampling_inputs`: per-sample tensor copies, shared-key unwrap path.
+   - Compare `block_forward` call count / time per block between old and new arch under identical config.
+
+---
+
+## 4. HPU Test Environment (**use this to reproduce — do not reinvent**)
+
+### 4.1 SSH chain (3 hops)
+
+```
+local → ssh tensorflow@clx5673.ra.intel.com
+  → ssh -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55
+  → sshpass -p 1 ssh sdp@192.168.122.81 (host: kvm-01)
+  → docker exec AutoRoundDebug bash
+```
+
+- Final host `kvm-01` has **4× HL-225 (Gaudi2, 96 GB HBM each)**, driver 1.24.0, hl-1.23.0.
+- Container `AutoRoundDebug`: Ubuntu 24.04, `torch 2.9.0+hpu.1.23.0.695`, `habana-torch-plugin 1.23.0.695`.
+- Code lives at `/ar_work_space/auto-round-patched/` **inside** the container (NOT bind-mounted; must be copied via SSH).
+- HF cache inside container: `~/.cache/huggingface/hub/` contains `models--Qwen--Qwen3-0.6B` and `Qwen3-1.7B` (already downloaded, no HF token needed).
+- `auto_round` is **not** pip-installed in the container. Run via `PYTHONPATH=/ar_work_space/auto-round-patched` + `python3 -m auto_round ...`.
+
+### 4.2 Helper script that works across all 3 hops
+
+Saved at `/tmp/hpu_run.sh` (local).
+It base64-encodes the command to avoid shell escape hell:
+
+```bash
+#!/bin/bash
+# Usage: /tmp/hpu_run.sh '<command>'
+set -e
+CMD="$1"
+B64=$(printf '%s' "$CMD" | base64 -w0)
+ssh -o StrictHostKeyChecking=no -T tensorflow@clx5673.ra.intel.com \
+  "ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55 \
+  \"sshpass -p 1 ssh -o StrictHostKeyChecking=no sdp@192.168.122.81 \\\"docker exec AutoRoundDebug bash -c 'echo $B64 | base64 -d | bash'\\\"\""
+```
+
+### 4.3 Syncing local code → container (no bind mount)
+
+```bash
+cd /home/hengguo/code/bug_fix/auto-round && \
+tar -czf - auto_round/ | ssh tensorflow@clx5673.ra.intel.com \
+  "ssh -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55 \
+  'sshpass -p 1 ssh -o StrictHostKeyChecking=no sdp@192.168.122.81 \
+  \"docker exec -i AutoRoundDebug bash -c \\\"rm -rf /ar_work_space/auto-round-patched/auto_round && tar -C /ar_work_space/auto-round-patched -xzf - && echo OK\\\"\"'"
+```
+
+Repeat for `auto_round_extension/`, `setup.py`, `setup.cfg`, `pyproject.toml` if those change.
+
+### 4.4 Canonical perf commands
+
+**Short sanity (20 iters, ~90 s wall):**
+```bash
+/tmp/hpu_run.sh 'cd /ar_work_space/auto-round-patched && \
+  export PYTHONPATH=/ar_work_space/auto-round-patched:$PYTHONPATH && \
+  export HF_HUB_DISABLE_PROGRESS_BARS=1 TQDM_MININTERVAL=60 && \
+  rm -rf /tmp/ar_test && \
+  time python3 -m auto_round --model_name Qwen/Qwen3-0.6B --scheme W4A16 \
+  --iters 20 --enable_torch_compile --device hpu --output_dir /tmp/ar_test 2>&1 | tail -60'
+```
+
+**Full perf run (mirrors `performance_ut.sh`, 200 iters, ~12 min):**
+Replace `--iters 20` with `--iters 200` above. Baseline for W4A16/Qwen3-0.6B is ~445 s tuning on this box.
+
+**FP8_STATIC reproduction:** replace `--scheme W4A16` with `--scheme FP8_STATIC`.
+
+---
+
+## 5. Uncommitted local changes (as of this handoff)
+
+```
+auto_round/algorithms/quantization/base.py — _sampling_inputs share_cache_keys unwrap
+auto_round/utils/device.py — get_device_and_parallelism dict handling
+auto_round/special_model_handler.py — L207 use pre-extracted model_type (gemma4 FrozenDict)
+test/test_cpu/export/test_export.py — added autoround_old.post_init() in INT8_W8A8 test
+```
+
+Plus earlier, already-committed fixes:
+- Deleted `auto_round/sign_sgd.py` (duplicate of `auto_round/algorithms/quantization/sign_round/sign_sgd.py`).
+- Removed duplicate `from auto_round.sign_sgd import SignSGD` in `auto_round/compressors/base.py`.
+- `calib.py`: `is_immediate_packing` flag + `tmp_dtype` definition.
+- `entry.py`: Removed speculative `_maybe_disable_hpu_eager_pipeline` / `_needs_hpu_fp8_static_eager_guard` (never validated on HPU).
+- `utils/device.py`: `get_device_and_parallelism` now handles `device=None` (fixes llmcompressor integration crash).
+
+Push these before the next CI run, or CI logs will still show pre-fix behaviour.
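+
+As a pointer for the first file in the list above, a minimal sketch (illustrative helper, not the actual `_sampling_inputs` code) of the share-cache-key handling described in §1: shared values are cached once by the hook as a one-element list and must be unwrapped and passed through for every sampled batch, regardless of how many sample indices are selected:
+
+```python
+SHARE_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings")
+
+def sample_inputs(cached_inputs: dict, indices: list[int]) -> dict:
+    """Illustrative only: build one batch from the cached calibration inputs."""
+    batch = {}
+    for key, value in cached_inputs.items():
+        if key in SHARE_CACHE_KEYS:
+            # Cached once and hook-wrapped as [val]: unwrap and reuse as-is.
+            batch[key] = value[0] if isinstance(value, list) else value
+        else:
+            # Per-sample values are simply gathered by index.
+            batch[key] = [value[i] for i in indices]
+    return batch
+```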
+
+---
+
+## 6. What the next agent should do (ordered)
+
+1. **Commit & push the uncommitted fixes in §5** so CI reflects current state.
+2. **Fix `check_performance.py`** (in `.azure-pipelines/scripts/performance/`) — it is almost certainly reporting `peak_vram` from block 0 instead of the run max. Local HPU proof in §3 shows VRAM=26.6 GB is correct.
+3. **Profile the real 2.7× tuning-time gap** with iters=200:
+   - Add timing around `_resolve_block_forward` branches (compiled vs plain).
+   - Log `self.compress_context.enable_torch_compile` once per block.
+   - Compare `_sampling_inputs` CPU time between archs (new arch has extra conditional branches for share_cache_keys).
+   - Check whether `torch.compile` cache is invalidated every block (`_invalidate_block_forward_cache` in `calib.py` at block boundary). Old arch reused the compiled function across blocks; new arch resets on `_dynamo.reset()` — confirm this is intentional and not the regression source.
+4. Only then consider the algorithm-level code as suspect.
+
+---
+
+## 7. Known-good signals (sanity checks)
+
+- `PT_HPU_LAZY_MODE=0` (eager mode) is active in this container.
+- `torch.hpu.is_available() → True`, `device_count() → 4`.
+- `from auto_round import __version__ → 0.13.0`.
+
+---
+
+## 8. Files to study first (highest-signal)
+
+| Path | Why |
+|---|---|
+| `auto_round/compressors_new/entry.py` | routing, scheme pre-resolution |
+| `auto_round/compressors_new/calib.py` | caching, block loop, immediate_pack, tmp_dtype |
+| `auto_round/algorithms/quantization/base.py` | `_sampling_inputs`, `_get_block_outputs`, `_resolve_block_forward` |
+| `auto_round/context/compress.py` / `model.py` | the three contexts `__getattr__` delegates to |
+| `auto_round/compressors/base.py` (old arch) | ground-truth reference for every behaviour |
+| `.azure-pipelines/scripts/performance/check_performance.py` | likely source of bogus VRAM=1.29 GB |
+
+---
+
+*Written 2026-04-22 during PR #1542 post-merge bug-fix session.*
diff --git a/performance_ut.sh b/performance_ut.sh
new file mode 100644
index 000000000..12e78906b
--- /dev/null
+++ b/performance_ut.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+set -euo pipefail
+
+PATTERN='[-a-zA-Z0-9_]*='
+
+for i in "$@"; do
+    case $i in
+        --model_name=*)
+            model_name=$(echo $i | sed "s/${PATTERN}//")
+            ;;
+        --scheme=*)
+            scheme=$(echo $i | sed "s/${PATTERN}//")
+            ;;
+        *)
+            echo "Parameter $i not recognized."
+            exit 1
+            ;;
+    esac
+done
+
+readonly WORKSPACE_DIR="/auto-round"
+readonly LOG_DIR="${WORKSPACE_DIR}/log_dir"
+readonly PERF_SCRIPT_DIR="${WORKSPACE_DIR}/.azure-pipelines/scripts/performance"
+readonly BASELINE_GIT_URL="git+https://github.com/intel/auto-round.git"
+readonly ITERS=200
+
+log_group_start() { echo "##[group]$1"; }
+log_group_end() { echo "##[endgroup]"; }
+log_info() { echo -e "[\033[32mINFO\033[0m] $1"; }
+log_err() { echo -e "[\033[31mERROR\033[0m] $1" >&2; }
+
+function setup_environment() {
+    log_group_start "Set up environment..."
+
+    export TZ='Asia/Shanghai'
+    export TQDM_MININTERVAL=60
+    export HF_HUB_DISABLE_PROGRESS_BARS=1
+    export UV_NO_PROGRESS=1
+    export UV_SYSTEM_PYTHON=1
+
+    log_info "Creating log directory: ${LOG_DIR}"
+    mkdir -p "${LOG_DIR}"
+
+    log_info "Downloading model: ${model_name}"
+    hf download "${model_name}"
+
+    log_group_end
+}
+
+function install_auto_round() {
+    local install_source=$1
+    local mode_name=$2
+
+    log_group_start "Install requirements for [${mode_name}]..."
+
+    (
+        cd "${WORKSPACE_DIR}"
+        log_info "Uninstalling existing auto-round..."
+        uv pip uninstall auto-round || true
+
+        log_info "Installing auto-round from: ${install_source}"
+        BUILD_HPU_ONLY=1 uv pip install "${install_source}"
+    )
+
+    log_group_end
+}
+
+function run_performance_test() {
+    local test_mode=$1
+    local log_file="${LOG_DIR}/perf_test_${test_mode}.log"
+
+    log_group_start "Run ${test_mode} performance test (${scheme})..."
+
+    (
+        cd "${PERF_SCRIPT_DIR}"
+        log_info "Executing auto-round for ${scheme}.
Logging to ${log_file}" + auto-round \ + --model_name "${model_name}" \ + --scheme "${scheme}" \ + --iters "${ITERS}" \ + --enable_torch_compile \ + --device hpu \ + --output_dir "./${test_mode}" 2>&1 | tee -a "${log_file}" + ) + + log_group_end +} + +function run_performance_check() { + log_group_start "Check performance results..." + + ( + cd "${PERF_SCRIPT_DIR}" + log_info "Executing check_performance.py" + python check_performance.py + ) + + log_group_end +} + +function main() { + setup_environment + + install_auto_round "." "current" + run_performance_test "current" + + install_auto_round "${BASELINE_GIT_URL}" "baseline" + run_performance_test "baseline" + + run_performance_check + + log_info "All tasks completed successfully." +} + +main "$@" \ No newline at end of file diff --git a/profile_rss_per_block.py b/profile_rss_per_block.py new file mode 100644 index 000000000..06dd014f6 --- /dev/null +++ b/profile_rss_per_block.py @@ -0,0 +1,194 @@ +"""Granular per-block RSS profiling for peak RAM regression diagnosis. + +Instruments both old and new architecture to measure RSS at key points +within the per-block quantization loop. + +Usage: + # New arch: + python profile_rss_per_block.py + # Old arch: + AR_DISABLE_NEW_ARCH=1 python profile_rss_per_block.py +""" +import gc +import os +import resource +import sys +import time + + +def rss_mb(): + """Get current RSS in MB (no gc.collect - raw measurement).""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 # KB -> MB on Linux + + +def rss_mb_clean(): + """Get current RSS in MB after gc.collect.""" + gc.collect() + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + + +# Use psutil for live RSS (ru_maxrss is peak, not current) +import psutil + +_proc = psutil.Process() + + +def live_rss_mb(): + """Current RSS in MB (not peak).""" + return _proc.memory_info().rss / (1024*1024) + + +def live_rss_mb_clean(): + gc.collect() + try: + import ctypes + libc = ctypes.CDLL("libc.so.6") + libc.malloc_trim(0) + except Exception: + pass + return _proc.memory_info().rss / (1024*1024) + + +arch = os.environ.get("AR_DISABLE_NEW_ARCH", "0") +arch_label = "OLD" if arch == "1" else "NEW" +print(f"\n{'='*70}") +print(f" {arch_label} Architecture - Granular Per-Block RSS Profiling") +print(f"{'='*70}") +print(f"Before import RSS: {live_rss_mb():.1f} MB") + +# Monkey-patch to add instrumentation +if arch != "1": + # NEW ARCH: patch CalibCompressor._quantize_single_block + from auto_round.compressors_new import calib as calib_mod + _orig_quantize_single_block = calib_mod.CalibCompressor._quantize_single_block + _orig_quantize_blocks = calib_mod.CalibCompressor._quantize_blocks + + _block_rss_log = [] + + def _patched_quantize_single_block(self, model, m, input_ids, input_others, q_input): + block_idx = len(_block_rss_log) + rss_before = live_rss_mb() + + result = _orig_quantize_single_block(self, model, m, input_ids, input_others, q_input) + + rss_after_return = live_rss_mb() + gc.collect() + rss_after_gc = live_rss_mb() + try: + import ctypes + libc = ctypes.CDLL("libc.so.6") + libc.malloc_trim(0) + except Exception: + pass + rss_after_trim = live_rss_mb() + + entry = { + 'block': block_idx, + 'before': rss_before, + 'after_return': rss_after_return, + 'after_gc': rss_after_gc, + 'after_trim': rss_after_trim, + 'delta_return': rss_after_return - rss_before, + 'delta_gc': rss_after_gc - rss_before, + 'delta_trim': rss_after_trim - rss_before, + } + _block_rss_log.append(entry) + print( + f" Block {block_idx:2d}: 
before={rss_before:.1f} after_ret={rss_after_return:.1f} " + f"after_gc={rss_after_gc:.1f} after_trim={rss_after_trim:.1f} " + f"delta_ret={entry['delta_return']:+.1f} delta_trim={entry['delta_trim']:+.1f} MB", + flush=True) + return result + + calib_mod.CalibCompressor._quantize_single_block = _patched_quantize_single_block + +else: + # OLD ARCH: patch LLMCompressor._quantize_block + from auto_round.compressors import base as base_mod + _orig_quantize_block = base_mod.LLMCompressor._quantize_block + + _block_rss_log = [] + + def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, device="cpu", auto_offload=True): + block_idx = len(_block_rss_log) + rss_before = live_rss_mb() + + result = _orig_quantize_block(self, block, input_ids, input_others, q_input, device, auto_offload) + + rss_after_return = live_rss_mb() + gc.collect() + rss_after_gc = live_rss_mb() + try: + import ctypes + libc = ctypes.CDLL("libc.so.6") + libc.malloc_trim(0) + except Exception: + pass + rss_after_trim = live_rss_mb() + + entry = { + 'block': block_idx, + 'before': rss_before, + 'after_return': rss_after_return, + 'after_gc': rss_after_gc, + 'after_trim': rss_after_trim, + 'delta_return': rss_after_return - rss_before, + 'delta_gc': rss_after_gc - rss_before, + 'delta_trim': rss_after_trim - rss_before, + } + _block_rss_log.append(entry) + print( + f" Block {block_idx:2d}: before={rss_before:.1f} after_ret={rss_after_return:.1f} " + f"after_gc={rss_after_gc:.1f} after_trim={rss_after_trim:.1f} " + f"delta_ret={entry['delta_return']:+.1f} delta_trim={entry['delta_trim']:+.1f} MB", + flush=True) + return result + + base_mod.LLMCompressor._quantize_block = _patched_quantize_block + +print(f"After import RSS: {live_rss_mb():.1f} MB") + +from auto_round import AutoRound + +print(f"After AutoRound import RSS: {live_rss_mb():.1f} MB") + +import shutil + +save_dir = "/tmp/profile_rss_output" +shutil.rmtree(save_dir, ignore_errors=True) + +print(f"\nCreating AutoRound instance...") +ar = AutoRound( + model="Qwen/Qwen3-0.6B", + scheme="FP8_STATIC", + iters=200, + nsamples=128, + enable_torch_compile=True, +) +print(f"After init RSS: {live_rss_mb():.1f} MB") +print(f"After init RSS (clean): {live_rss_mb_clean():.1f} MB") + +print(f"\nStarting quantize_and_save...\n") +model, folder = ar.quantize_and_save(output_dir=save_dir, format="llm_compressor") + +print(f"\n{'='*70}") +print(f" SUMMARY ({arch_label} Architecture)") +print(f"{'='*70}") +print(f"Final RSS: {live_rss_mb():.1f} MB") +print(f"Final RSS (clean): {live_rss_mb_clean():.1f} MB") +print(f"\nPer-block deltas (after return, after gc+trim):") +for e in _block_rss_log: + print( + f" Block {e['block']:2d}: delta_ret={e['delta_return']:+.1f} delta_trim={e['delta_trim']:+.1f} MB " + f"(abs: {e['after_trim']:.1f} MB)") + +# Compute growth rate +if len(_block_rss_log) >= 2: + first = _block_rss_log[0]['after_trim'] + last = _block_rss_log[-1]['after_trim'] + n = len(_block_rss_log) - 1 + print(f"\nGrowth: {first:.1f} -> {last:.1f} MB over {n} blocks = {(last-first)/n:.1f} MB/block avg") + +print(f"\nPeak RSS (ru_maxrss): {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.1f} MB") + +shutil.rmtree(save_dir, ignore_errors=True) From 00fc80064634e3c50f2b40828be97809b971b2a9 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 11:10:53 +0800 Subject: [PATCH 76/90] refactor: dedupe experimental/transform, re-export from new arch rotation module - Convert 5 duplicated files in experimental/transform/ into thin re-export shims pointing 
to auto_round/algorithms/transforms/rotation/: * utils/hadamard.py -> utils/math.py * utils/matrix.py -> utils/matrix.py (incl. multihead_matmul rename) * hadamards.py -> transforms.py * patch_modules.py -> patch.py * triton/mxfp4.py -> utils/triton/mxfp4.py - Remove the duplicated hadamards.safetensors, keep only the new-arch copy - Update setup.py package_data to reference the new safetensors path Old external callers (compressors/base.py, inference/convert_model.py, __main__.py, experimental/apply_rotation_transform.py, experimental/utils.py, experimental/rotation_inplace/utils.py) keep working unchanged via the shims. rotation_config.py and apply.py are kept as-is because they carry backend-dispatch semantics (inplace vs transform) that do not exist in the new architecture. --- .../experimental/transform/hadamards.py | 165 ++------------ .../experimental/transform/patch_modules.py | 187 ++-------------- .../experimental/transform/triton/mxfp4.py | 203 +----------------- .../experimental/transform/utils/hadamard.py | 157 ++------------ .../transform/utils/hadamards.safetensors | Bin 1436901 -> 0 bytes .../experimental/transform/utils/matrix.py | 103 +-------- setup.py | 7 +- 7 files changed, 78 insertions(+), 744 deletions(-) delete mode 100644 auto_round/experimental/transform/utils/hadamards.safetensors diff --git a/auto_round/experimental/transform/hadamards.py b/auto_round/experimental/transform/hadamards.py index 42c47818d..8f52b80a5 100644 --- a/auto_round/experimental/transform/hadamards.py +++ b/auto_round/experimental/transform/hadamards.py @@ -1,143 +1,22 @@ -# Copyright (c) 2026 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import math -from typing import Any, Callable, Dict - -import torch -import torch.nn as nn - -from auto_round.experimental.transform.utils.hadamard import deterministic_hadamard_matrix, random_hadamard_matrix -from auto_round.experimental.transform.utils.matrix import apply_transform_weight - - -def filter_kwarg_dict(fn_or_method: Callable, kwarg_dict: Dict[str, Any]) -> Dict[str, Any]: - fn_or_method_keys = inspect.signature(fn_or_method).parameters.keys() - return {k: v for k, v in kwarg_dict.items() if k in fn_or_method_keys} - - -class HadamardTransform(nn.Module): - - def __init__( - self, - block_size: int = 32, - device: torch.device = None, - precision: torch.dtype = torch.float32, - location: str = "weight", - module_type: type[torch.nn.Module] = torch.nn.Linear, - inverse: bool = False, - ): - """Initialize a Hadamard transform module. - - Args: - block_size: Size of each Hadamard block. The input tensor is reshaped - to ``(-1, block_size)`` before applying the transform. - device: Device on which to create the Hadamard matrix. - precision: Data type used for the Hadamard matrix weights, using float32 as default. - location: Target location used by ``apply_transform_weight`` when - applying the transform. - module_type: Module type associated with the transform application, - typically ``torch.nn.Linear``. 
- inverse: Whether to build the inverse form of the transform. - """ - - super().__init__() - self.size = block_size - self.scale = 1 / math.sqrt(self.size) - self.location = location - self.module_type = module_type - self.inverse = inverse - self.weight = self._create_weight(self.size, device, precision) - - def _create_weight( - self, - size: int, - device: torch.device = None, - precision: torch.dtype = torch.float32, - ) -> torch.nn.Parameter: - data = deterministic_hadamard_matrix(size, precision, device) * self.scale - # TODO: implement SpinQuant, which rotation matrix is learnable - return nn.Parameter(data, requires_grad=False) - - def forward(self, x: torch.Tensor): - # Hadamard transform is it own inverse - ori_shape = x.shape - x = x.view(-1, self.size) - return ( - ( - apply_transform_weight( - self.weight.to(x.device), - x.to(dtype=self.weight.dtype), - self.location, - self.module_type, - ) - ) - .to(x.dtype) - .view(ori_shape) - ) - - -class RandomHadamardTransform(HadamardTransform): - def __init__( - self, - block_size: int = 32, - device: torch.device = None, - precision: torch.dtype = None, - location: str = "weight", - module_type: type[torch.nn.Module] = torch.nn.Linear, - inverse: bool = False, - seed: int | None = None, - generator: torch.Generator | None = None, - ): - if generator is not None: - self.generator = generator - else: - self.generator = torch.Generator() - if seed is not None: - self.generator.manual_seed(seed) - - super().__init__( - block_size=block_size, - device=device, - precision=precision, - location=location, - module_type=module_type, - inverse=inverse, - ) - - def _create_weight( - self, - size: int, - device: torch.device = None, - precision: torch.dtype = None, - ) -> torch.nn.Parameter: - data = random_hadamard_matrix(size, precision, device, self.generator) * self.scale - # activation needs transpose - if self.inverse: - data = data.T - # data = deterministic_hadamard_matrix(size, precision, device) * self.scale - # TODO: implement SpinQuant, which rotation matrix is learnable - return nn.Parameter(data, requires_grad=False) - - -HADAMARDS = { - "hadamard": HadamardTransform, - "random_hadamard": RandomHadamardTransform, -} - - -def build_hadamard_transform(hadamard_type: str, **hadamard_kwargs): - hadamard = HADAMARDS[hadamard_type] - return hadamard(**filter_kwarg_dict(hadamard.__init__, hadamard_kwargs)) +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. + +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.transforms`. 
+""" + +from auto_round.algorithms.transforms.rotation.transforms import ( # noqa: F401 + HADAMARDS, + HadamardTransform, + RandomHadamardTransform, + _filter_kwargs as filter_kwarg_dict, + build_hadamard_transform, +) + +__all__ = [ + "HADAMARDS", + "HadamardTransform", + "RandomHadamardTransform", + "build_hadamard_transform", +] diff --git a/auto_round/experimental/transform/patch_modules.py b/auto_round/experimental/transform/patch_modules.py index e0f9adbae..980c28588 100644 --- a/auto_round/experimental/transform/patch_modules.py +++ b/auto_round/experimental/transform/patch_modules.py @@ -1,172 +1,19 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 - -import torch -import transformers - -from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear, pack_fp4_to_uint8 -from auto_round.wrapper import WrapperLinear, WrapperWALayer - - -def patch_wrapperlinear_to_apply_transform(w_transform, inp_transform): - """ - Globally monkey-patch WrapperLinear._qdq_weight and WrapperLinear._qdq_act so that it applies - a weight and activation transform before quantization. - - e.g. by apply_transform() before wrapper_block(). - """ - - if getattr(WrapperLinear, "_hadamard_patched", False): - return - - orig_qdq_weight = WrapperLinear._qdq_weight - - def _qdq_weight_patched(self, value, min_scale, max_scale): - """ - # If no transform attached, fall back to original behavior - if not hasattr(self.orig_layer, transform_attr): - return orig_qdq_weight(self, value, min_scale, max_scale) - """ - - if self.orig_layer.bits >= 16: - # keep original behavior for >=16bit to avoid changing semantics unexpectedly - return orig_qdq_weight(self, value, min_scale, max_scale) - - if getattr(self, "applied_weight_hadamard", None) is None: - with torch.no_grad(): - weight = self.orig_layer.weight - if weight.device.type == "meta": - weight = self.orig_layer.get_weight().to(self.device) - - is_conv1d = type(self.orig_layer) == transformers.pytorch_utils.Conv1D - if is_conv1d: - weight = weight.t().continuous() - new_weight = w_transform(weight).to(self.device) - if is_conv1d: - new_weight = weight.t().continuous() - self.orig_layer.weight.data.copy_(new_weight) - self.applied_weight_hadamard = True - - return orig_qdq_weight(self, value, min_scale, max_scale) - - orig_qdq_act = WrapperLinear._qdq_act - - def _qdq_act_patched(self, x, act_min_scale, act_max_scale, act_max=None): - - x = inp_transform(x) - - return orig_qdq_act(self, x, act_min_scale, act_max_scale, act_max) - - WrapperLinear._qdq_weight = _qdq_weight_patched - WrapperLinear._qdq_act = _qdq_act_patched - WrapperLinear._hadamard_patched = True - - -def patch_wrapperwalayer_forward_to_apply_transform(inp_transform): - """ - Globally monkey-patch WrapperWALayer.forward so that it applies - a activation transform before quantization. - - e.g. by apply_transform() before wrapper_block(). 
- """ - - if getattr(WrapperWALayer, "_hadamard_forward_patched", False): - return - - orig_forward = WrapperWALayer.forward - - def _forward_patched(self, x): - """ - # If no transform attached, fall back to original behavior - if not hasattr(self.orig_layer, transform_attr): - return orig_forward(self, x) - """ - - act_max = self.orig_layer.act_max if hasattr(self.orig_layer, "act_max") else None - - # transform = getattr(self.orig_layer, transform_attr) - x = inp_transform(x) - - x, _, _ = self.orig_layer.act_quant_func( - x, - bits=self.orig_layer.act_bits, - group_size=self.orig_layer.act_group_size, - scale_dtype=self.orig_layer.scale_dtype, - q_scale_thresh=self.orig_layer.q_scale_thresh, - data_type=self.orig_layer.act_data_type, - tensor_max=act_max, - ) - return self.orig_layer.forward(x) - - WrapperWALayer.forward = _forward_patched - WrapperWALayer._hadamard_forward_patched = True - - -def patch_quantlinear(w_transform): - """ """ - - if getattr(QuantLinear, "_pack_patched", False): - return - - from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal - from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad - from auto_round.utils import get_packing_device - - E8M0_EXPONENT_BIAS = 127 - E8M0_EXPONENT_NAN_VAL = 255 - - def _pack_patched( - self, linear, scales, zeros=None, g_idx=None, global_scale=None, input_global_scale=None, device=None - ): - device = get_packing_device(device) - if getattr(linear, "bias", None) is not None: - self.bias = linear.bias.detach().to(torch.float16) - - W = linear.weight.data.detach().to(device) - if type(linear) == torch.nn.Conv2d: - W = W.flatten(1) - if type(linear) == transformers.pytorch_utils.Conv1D: - W = W.t() - - tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(W, self.group_size) - scales = scales.to(device) - if self.is_nv: - assert global_scale is not None and global_scale.numel() == 1 - global_scale = global_scale.reshape([1]) - global_scale = global_scale.to(device) - scaled_tensor = tensor.to(global_scale.dtype) * get_reciprocal( - scales.reshape(tensor.shape[0], -1) * get_reciprocal(global_scale) - ) - scaled_tensor.clamp_(-6.0, 6.0) - scaled_tensor = cast_to_fp4(scaled_tensor) - else: - scaled_tensor = tensor / (2 ** scales.reshape(tensor.shape[0], -1)) - scaled_tensor = revert_tensor_by_pad(scaled_tensor, orig_shape=orig_shape, pad_len=pad_len) - if self.is_mx: - final_scale = (scales + E8M0_EXPONENT_BIAS).clamp(0, E8M0_EXPONENT_NAN_VAL).to(torch.uint8) - else: - final_scale = scales.to(torch.float8_e4m3fn) - - self.weight_scale = final_scale - # self.weight = get_compressed_weight(scaled_tensor, self.bits, self.data_type) ## TODO - if self.bits == 8: - compress_dtype = torch.float8_e4m3fn - self.weight = scaled_tensor.to(compress_dtype) - - else: - compress_dtype = torch.uint8 - self.weight_packed = pack_fp4_to_uint8(scaled_tensor) - - if global_scale is not None: - self.weight_global_scale = global_scale.to(torch.float32).to(device) - - if input_global_scale is not None: - # TODO: the shape of `input_global_scale` is [] in some cases — need to investigate why. - self.input_global_scale = input_global_scale.to(torch.float32).to(device).reshape([1]) - - # add transform weight - self.register_buffer("hadamard_matrix", w_transform.weight.to(device)) - return - - QuantLinear.pack = _pack_patched - QuantLinear._pack_patched = True +"""Backward-compat re-export shim. + +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.patch`. 
+""" + +from auto_round.algorithms.transforms.rotation.patch import ( # noqa: F401 + patch_quantlinear, + patch_wrapperlinear_to_apply_transform, + patch_wrapperwalayer_forward_to_apply_transform, +) + +__all__ = [ + "patch_quantlinear", + "patch_wrapperlinear_to_apply_transform", + "patch_wrapperwalayer_forward_to_apply_transform", +] diff --git a/auto_round/experimental/transform/triton/mxfp4.py b/auto_round/experimental/transform/triton/mxfp4.py index 8028c167b..3cc26a7d0 100644 --- a/auto_round/experimental/transform/triton/mxfp4.py +++ b/auto_round/experimental/transform/triton/mxfp4.py @@ -1,197 +1,14 @@ -# Copyright (c) 2026 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -# Refer code here: -# https://github.com/IST-DASLab/FP-Quant/blob/master/inference_lib/src/fp_quant/module/triton/mxfp4.py +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.utils.triton.mxfp4`. +""" -import torch -import triton # pylint: disable=E0401 -import triton.language as tl # pylint: disable=E0401 - - -@triton.autotune( - configs=[ - triton.Config({"BLOCK_SIZE": 32 * 32}), - triton.Config({"BLOCK_SIZE": 64 * 32}), - triton.Config({"BLOCK_SIZE": 128 * 32}), - triton.Config({"BLOCK_SIZE": 256 * 32}), - triton.Config({"BLOCK_SIZE": 512 * 32}), - ], - key=[], +from auto_round.algorithms.transforms.rotation.utils.triton.mxfp4 import ( # noqa: F401 + mxfp4_forward_kernel, + mxfp4_forward_kernel_wrapper, ) -@triton.jit -def mxfp4_forward_kernel( - x_ptr, - hadamard_matrix_ptr, - output_ptr, - clip_mask_ptr, - n_elements: tl.constexpr, - hadamard_dim: tl.constexpr, - group_size: tl.constexpr, - gaussian_scale: tl.constexpr, - quest: tl.constexpr, - BLOCK_SIZE: tl.constexpr, -): - offsets_hadamard = tl.arange(0, hadamard_dim * hadamard_dim) - hadamard_matrix = tl.load(hadamard_matrix_ptr + offsets_hadamard).reshape(hadamard_dim, hadamard_dim) - - # load x - pid = tl.program_id(0) - start_idx = pid * BLOCK_SIZE - offsets = start_idx + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x_flat = tl.load(x_ptr + offsets, mask=mask) - - # hadamard transform - x = tl.reshape(x_flat, (BLOCK_SIZE // hadamard_dim, hadamard_dim)) - x_had = tl.dot(x, hadamard_matrix) - - # group - x_had_grouped = tl.reshape(x_had, (BLOCK_SIZE // group_size, group_size)) - - # scale - # quest=True: per-group Gaussian-based scale = gaussian_scale * std - # quest=False: per-group max-abs-based scale, adjusted to FP4 range - if quest: - mean_squared = tl.sum(x_had_grouped * x_had_grouped, axis=-1, keep_dims=True) / group_size - mean = tl.sum(x_had_grouped, axis=-1, keep_dims=True) / group_size - std = tl.sqrt(mean_squared - mean * mean) - scales = gaussian_scale * std + 1e-8 - shared_exps = tl.exp2(tl.floor(tl.log2(scales))) - x_had_scaled = x_had_grouped / shared_exps - else: - scales = tl.max(tl.abs(x_had_grouped), axis=-1, keep_dims=True) - shared_exps = 
tl.exp2(tl.floor(tl.log2(scales)) - 2) / (3 / 4) - x_had_scaled = x_had_grouped / shared_exps - - # quantize - # Map abs(x) to FP4 levels {0, 0.5, 1, 1.5, 2, 3, 4, 6} - x_had_scaled_abs = tl.abs(x_had_scaled) - x_had_scaled_sign = tl.where( - x_had_scaled > 0, - 1, - -1, - ) - - x_fp4 = ( - tl.where( - x_had_scaled_abs > 5, - 6, - tl.where( - x_had_scaled_abs > 3.5, - 4, - tl.where( - x_had_scaled_abs > 2.5, - 3, - tl.where( - x_had_scaled_abs > 1.75, - 2, - tl.where( - x_had_scaled_abs > 1.25, - 1.5, - tl.where( - x_had_scaled_abs > 0.75, - 1, - tl.where( - x_had_scaled_abs > 0.25, - 0.5, - 0, - ), - ), - ), - ), - ), - ), - ) - * x_had_scaled_sign - ) - if clip_mask_ptr is not None: - tl.store( - clip_mask_ptr + offsets, - tl.reshape(x_had_scaled_abs < 6, (BLOCK_SIZE,)), - mask=mask, - ) - - # dequantize - x_dequantized = x_fp4 * shared_exps - - # Reshape back to flat form for storage - x_dequantized_flat = tl.reshape(x_dequantized, (BLOCK_SIZE,)) - - # store - tl.store(output_ptr + offsets, x_dequantized_flat, mask=mask) - - -@torch.compiler.disable() -def mxfp4_forward_kernel_wrapper( - x, - hadamard_matrix, - return_clip_mask=False, - quest=False, - gaussian_scale=3 / 4, -): - """ - Refer code here: - https://github.com/IST-DASLab/FP-Quant/blob/master/inference_lib/src/fp_quant/module/triton/mxfp4.py - Apply Hadamard transform + group-wise FP4 quantize/dequantize on x. - - Note: - The output is still in the Hadamard-transformed space (no inverse Hadamard is applied). - """ - # Pick a device — we require CUDA - device = x.device - if device.type != "cuda": - raise RuntimeError( - f"mxfp4_forward_kernel_wrapper requires a CUDA tensor for 'x', " - f"but got device '{device.type}'. Please move inputs to CUDA before calling." - ) - - # Ensure hadamard_matrix is on the same CUDA device - if hadamard_matrix.device != device: - hadamard_matrix = hadamard_matrix.to(device) - - dtype = hadamard_matrix.dtype - - if x.dtype != dtype: - x = x.to(dtype) - - # Make sure inputs are contiguous - x = x.contiguous() - hadamard_matrix = hadamard_matrix.contiguous() - - # Create output tensors on CUDA - output = torch.empty_like(x, device=device) - if return_clip_mask: - clip_mask = torch.empty_like(x, dtype=torch.bool, device=device).contiguous() - else: - clip_mask = None - - # Get total number of elements and calculate grid for launching the kernel - n_elements = x.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) - - # Launch kernel – no need for `with torch.device(...)` - mxfp4_forward_kernel[grid]( - x_ptr=x, - hadamard_matrix_ptr=hadamard_matrix, - output_ptr=output, - clip_mask_ptr=clip_mask, - n_elements=n_elements, - hadamard_dim=hadamard_matrix.shape[-1], - group_size=32, - gaussian_scale=gaussian_scale, - quest=quest, - ) - return output, clip_mask +__all__ = ["mxfp4_forward_kernel", "mxfp4_forward_kernel_wrapper"] diff --git a/auto_round/experimental/transform/utils/hadamard.py b/auto_round/experimental/transform/utils/hadamard.py index 320ae1832..aa9e59d9a 100644 --- a/auto_round/experimental/transform/utils/hadamard.py +++ b/auto_round/experimental/transform/utils/hadamard.py @@ -1,151 +1,18 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -import math -from pathlib import Path - -import torch -from safetensors import safe_open - -REPO_PATH = Path(__file__).parent / "hadamards.safetensors" +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.utils.math`. 
+""" +from auto_round.algorithms.transforms.rotation.utils.math import ( # noqa: F401 + _fetch_hadamard_divisor, + _HADAMARD_MATRICES_PATH as REPO_PATH, + _matmul_hadU, + deterministic_hadamard_matrix, + is_pow2, + random_hadamard_matrix, +) __all__ = ["random_hadamard_matrix", "deterministic_hadamard_matrix", "is_pow2"] - - -# note that hadamard matrix multiplication reuses the code from -# https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/hadamard.py - - -def deterministic_hadamard_matrix( - size: int, - dtype: torch.dtype = torch.bfloat16, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """ - Construct an n-by-n Hadamard matrix, using Sylvester's construction. - `n` must be a power of 2. - - Adapted from https://github.com/scipy/scipy/blob/v1.15.2/scipy/linalg/_special_matrices.py # noqa: E501 - - :param size: order of the matrix, must be a power of 2 - :param dtype: data type of matrix - :param device: device to construct matrix on - :return: hadamard matrix of size `size` - """ - if size <= 0: - raise ValueError("Cannot construct deterministic hadamard of size <= 0") - - log2 = int(math.log2(size)) - if size != 2**log2: - raise ValueError("Cannot construct deterministic hadamard of size != 2^n") - - H = torch.tensor([[1]], dtype=dtype, device=device) - - # Sylvester's construction - for _ in range(log2): - H = torch.vstack((torch.hstack((H, H)), torch.hstack((H, -H)))) - - return H - - -def random_hadamard_matrix( - size: int, - dtype: torch.dtype = torch.bfloat16, - device: torch.device = torch.device("cpu"), - gen: torch.Generator | None = None, -) -> torch.Tensor: - """ - Produces a randomly generated Hadamard matrix. Differs from - `deterministic_hadamard_matrix` in that this function supports non powers of 2 - and randomization using a seeded generator - - Adapted from https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py # noqa: E501 - Known matrices were retrieved from N. J. A. Sloane's Library of Hadamard Matrices - http://www.neilsloane.com/hadamard/ # noqa: E501 - - :param size: The dimension of the hamadard matrix - :param dtype: data type of matrix - :param device: device to construct matrix on - :param gen: Optional generator random values - :return: randomly generated hadamard matrix - """ - Q = torch.randint(low=0, high=2, size=(size,), generator=gen, device="cpu") # cpu - Q = Q.to(device=device, dtype=dtype) - Q = Q * 2 - 1 - Q = torch.diag(Q) - return _matmul_hadU(Q) - - -def is_pow2(n: int) -> bool: - """ - Check if a number is a power of 2 - - :param n: number to check - :return: True iff `n` is a power of 2 - """ - return n > 0 and (n & (n - 1) == 0) - - -def _fetch_hadamard_divisor( - n: int, - dtype: torch.dtype, - device: torch.device = torch.device("cpu"), - file_path: str = REPO_PATH, -) -> torch.Tensor | None: - """ - Fetch a known hadamard matrix from the given file path. The returned matrix will - be of of size `k` such that `n / k` is a power of two. Return None if no such - matrix exists. - - Note: This function reopens the safetensors file every time it is called. 
-        This is technically inefficient, but a very small runtime cost and simpler
-        than forcing callers to manage the file open context
-
-    :param n: size of known hadamard matrix
-    :param dtype: data type to move fetched hadamard to
-    :param device: device to move fetched hadamard to
-    :return: a known hadamard matrix of size `n` if one exists, else None
-    """
-    open_device = torch.device("cpu") if device.type == "meta" else device
-    with safe_open(file_path, framework="pt", device=str(open_device)) as file:
-        divisors = sorted((int(key) for key in file.keys()), reverse=True)
-        for divisor in divisors:
-            if n % divisor == 0 and is_pow2(n // divisor):
-                return file.get_tensor(str(divisor)).to(dtype=dtype, device=device)
-
-    return None
-
-
-def _matmul_hadU(X: torch.Tensor) -> torch.Tensor:
-    size = X.size(0)
-    dtype = X.dtype
-    device = X.device
-
-    # Check if we have the determined hadamard matrix
-    hadK = _fetch_hadamard_divisor(size, dtype, device=device)
-    if hadK is None:
-        raise ValueError(f"Cannot construct random hadamard matrix of size {size}")
-    K = hadK.size(0)
-
-    # Reshape diag matrix with randomized -1/+1
-    input = X.clone().view(-1, size, 1)
-    output = input.clone()
-    while input.shape[1] > K:
-        input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2])
-        output = output.view(input.shape)
-        output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
-        output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
-        output = output.view(input.shape[0], input.shape[1], -1)
-        input, output = (output, input)
-    assert input.shape[1] == K
-    del output
-
-    # Do not explicitly repeat - OOM
-    # input = torch.bmm(
-    #     hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input)
-    # Use bcast instead
-    input = hadK.view(1, K, K).to(input) @ input
-
-    # normalize
-    return input.view(X.shape)
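
The helpers removed above construct Hadamard matrices (Sylvester's recursion plus a randomized ±1-diagonal variant) and apply them through the in-place butterfly in _matmul_hadU rather than materializing a dense matrix product, using known base matrices stored in hadamards.safetensors for sizes with non power-of-two factors. The following self-contained sketch shows the butterfly idea for the power-of-two case only; it is reference code, not the relocated auto_round implementation, and hadamard_butterfly is an illustrative name.

# Reference sketch of the fast Walsh-Hadamard butterfly for power-of-two sizes.
import torch


def hadamard_butterfly(x: torch.Tensor) -> torch.Tensor:
    """Multiply each row of x (last dim a power of two) by the Sylvester Hadamard matrix."""
    n = x.shape[-1]
    out = x.reshape(-1, n).clone()
    h = 1
    while h < n:
        # Each round combines pairwise block sums/differences of width h.
        out = out.reshape(-1, n // (2 * h), 2, h)
        a, b = out[:, :, 0, :], out[:, :, 1, :]
        out = torch.stack((a + b, a - b), dim=2).reshape(-1, n)
        h *= 2
    return out.reshape(x.shape)


# Sanity check against the explicit Sylvester construction for n = 8.
H = torch.tensor([[1.0]])
for _ in range(3):
    H = torch.vstack((torch.hstack((H, H)), torch.hstack((H, -H))))
x = torch.randn(3, 8)
assert torch.allclose(hadamard_butterfly(x), x @ H)
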
diff --git a/auto_round/experimental/transform/utils/hadamards.safetensors b/auto_round/experimental/transform/utils/hadamards.safetensors
deleted file mode 100644
index 9624e008623e86678a2da7f27000106e03055257..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1436901
zPSJmF+?DIrWH%n-q`K>lk?FQvwF;m@hiaElYa^0Hj&O`idY)*jxbAllcQ;x)c zKQugX|7<{G(O-4$Qy911F_Pbu%+Bu0b!)Oa5AlaK)(8KN$Ac5R>yDY}wq!Rp#*Mn^ z6erD%x$aBGFaKSQ@dxLwWOwJft1)IjdV5AsQI~>ywT}v=}6e zO4ed(qLi&&`0C{qX&5ivVtLZytNDy-O$IqREly-Z7$QnB!&4FSh8L@+DPt}>4wp_X zMsw+FYV+ZgxO#IA9bUs|vb(Z@$0-G=B_z36926Z3MHAA%sN`8$R|aH@50`pSQFuU%hyCb8-a8+qwcrh~=#v#VKyEBtX z4n`ba-rbQfnd7Su)`l0WWr_%28#M7(M|#}3G;kk#tO6uRv`E8>QdGB9N@)MNm~N_B52Wkn{hnkBB~UBc@D?OP_DM?}xqH8(vXj$NBL9pY%%9o0VaHz7i z3tzn+MH-%I0!`o|X=kZ%b|x~1HG@UM)KJP2Nf2XR>=H(moka>UOiUX_wb7{!Mp*DT z7(hkDC?fT1CRR%!2FMx0Ml~z7!Avxq4b5J_B;~5oVBKV^iZ2?;8Q9G#x*3C6i}sCd zKrS=|SfGcQ!5u?BEFfn{I7$_UBZs414G|$ThzTzZ3@?_@kOpF4$`E#SYLB{9*=9i$ z|M(VZc(G(@5#f_D4(3)zdcvxO*`JX7B#3d5_-K|wN8$ojjIqfMjk+1T7-hk*bqZfj zcvytU>QqH29~QHO9S2sv0J=yb1)*<1GCYZR%u1gO?ili60Xaj$Q3CMJ75lPvA_7xR zB(2|%r$LKB!5K;OES3kIl&M3RUWiBBwLG33J{S_~4dZh#ml3|My)%a;=# zKC?;~dUk~JVKGbCj)e09=vz6Q>{-k*f&{CPqE7}{4Ee;|GU71dyoPaPYhSi;=rE;K z#4h2*u1q5eKMHqEB{_*BFE{p}UbE%hTrq$NC|1u@kysQsTsjn(c#)7T5kmyz%4jn1 zgcnm~nC!LFK%dpcl~H-Q(l^l5v!w>I;tex{JBEB%Xctez)d@sKSbo?#5sAO%inKMn z*p&&Ed_3ZyTk?Ztg|gh(HxQUY;LS@3OzGL_BE$eWL)g{T9T_HS!rp*6G>#;;YnI%x zJZWd!QVgwbok)WV34L$o$P88^C1L0%LwqqI7KOW`3)w7dU$%CL;PQsbi(SHt)l!H7 za)z+0iwzFS;b>PwL?)3Djg*8JOQ=@W31VENVdy8rR*~3faU2I$%_n(H10ILaVe&o_^e{8HW5|bvb`A-nOcc_r zRpPk71_HxqHYi-RbjOg-u|v@?Omo0d23s(Ic0`0rR}!M)V6`;cv5RL6^@v51nh2lL zO+&^cNYa`&yqGFeX|iR+p@|HhN>VS@Ek6Od1c?^GWgTTP%Qg!UVHgV5vSvu3Gz_bDC{8%i+49y@Sd&_WFRWuF zc;l179Ya1Ww2P;-=q#J17a12c)DjZJLmzBmW^l)l4~uAP39Zs$@J0t+LFgL@{3r== zE!{EXbL`@oj7c^G-iAc61~Oy?!faUtF&;YXWMR=_vC}}z1iBE!flH2d1RDq(6-*F_ ztcoQ>SWWCSAS4MTNi|Sds=x*UV+>DvZHBEhDl@?{W{&INOVkoi}223&$O^QIA zybj73@?ilv1EYZi*(rk^L_jAZ!Z29O3vt4WB}5pOeAsblBvG_c3&R!AF9IV*i!?lw zZ2chNvv#bF7)TC{6fYV!A?W6?OcKSr;l++8)rOBp3>rad5C^WnrEdTR2AU`dv2I}c zxzHqbDvDqc2Mv-3q3`&SZ&nFLu$F`uQw3>03``C(5=m4iqEba--vBZUG%14FS}J46 z=h(S#6pJ`$pj!K~H((Bp6|5!UfO)N-^ZA;_IFq*g<&Cw2dm8frxap4F*ci{jU3ZL!@UBOZhw!e({C3^c znD^?Y#?sO~jise~8p~67*Q0pj9lNoye!Xt_j!j$Mte5h4)ZfeYD@JHS;zFapoMxy&3#oHRQr<)qf&AR7~CWSAWS-Yg&* zN-zSEhG#llmuxpSb7&~?f~+?{`~%8t4Mr|WhzKvLOST)sm5Wf5;lQ%-Ab^IfOqeZ; zAgMUjB@$13V*!J;J7puqLWjg5`&I=kBSLu66GsS;36-D9XA?2uypFY|sY6gE1=KD% z5$S0-oReXp=yiscaB?(dPp(=zI2XxvQ+A67`;aNfUt;#@PO(-g2 zn3W~cFqx_cmj#s9@lG6aK$+ABBf^!;Zo~RORR}K*!5x!ZleJDjJ+R*St4^JoR^A*(TD(rPDCl&mEb8J zPPi&jy9fiCp-QpPJ8{UqRcRJjlVk%ICYDNSxd@)f9B0}p*E(@Ub%*YS3gJqO%g@l< zgbQIho2Uurg$2vT6$plon=spz%ql1%^@%8sLu8KePmYnnPh3&mCt?M$W0#ki!dHZW z8C;M)#zW^4iWH!j8U(zdBoLS_ydZ+JPEc$HG)c<@laX*4#yK$Q%jh1yBsC0>RLU2(w)Yy!dE_ z0owhrh*i`;c>xrQ0|-#)^)z@%0zy1gLitoan~39|90!Yo9P?T2v16B)iCaXXxNa)a z$2hgHt59+=SU~`Xl0c+vlZ(Az%(^3#4~sZqmvEE-#o{m{Z77?%rXASaj-a$xrPZO2Wc^9AdU^IMiWXp@L|ApTovqt#X(LC6R4eH zT+9?3z0kwlPrSF3gsW3aPSA+z-T=voG@4xmj!Aw})j}%3mTzu5_4Ry%fm;U=2F z9Ya2q&n9BRd6~Xp9pnrsQ%jgKPPU?vmAk1%qB*nj+2O$Jcmvo49Dof(O0W`Fq8X$z zI5Ri8xLLuSsg0;d|2R{A>r!Oia}KXjYR^H zGP=?z=vumC$cII&@JkpKjt5hNfOis+Fw#;L6@+-G@bjr0nPS3u9qYsu2+G84?QpJ2 zUyE=+bW*{BCfgAxFMwidM1Vr41eru!;1iIQKFoyjVF5Wq!chVgi-R14(7j;FgUCyR zbpw=|aPq2j2)jD938x&zQZ%_dDZ^Jmh$%D)Vr$AcjkE<+x7LYc?>l)g!>;d{d=ZeJ z!5u?B)z2ns<Kiv zT$Mr$v5FXWUI4|`KVay%$%N(tpILE*k{zLZSU}E@aFi+A;PuLiXm)N zu-QQ|t_Win494ZiliRpxsO4Kkz%lQAt)Ft>1P{BEn(rMTP)4SFk3P@0_CNm;b_@l<1YNOXE85OeKNRX$cKgEb(nB< zY7^E01Sr={jggZ|xsv}Xu<;Z}ZQB&{?5ed=A*z?PSR7{LO;8Inb{mOq3F8cwkhTud zVqhh!n^}J1iiQx19s-giN`^L9E#0X+d4~_%Q7x8HG#o9PW}__mm`Giu;h6xKQ2Aki zoFU;^&kLZK8U(zfWQ{~HA<8tK5ZCq6Sl__b- z4!{`yALQ(^^PQ6g%Zv+>^+4}Qmn?TZCq5~YlKF-^_uE`f`V+q4KlsjF-?tkc%j@C4<4f>(*M3}c z`=h)~e(4X2KP5fFufKCiw2d{b{qZ-N{0zqXWBRYY;`cVkeY@>DXRe-6vwq=o^5S1a 
zU!LwU+aBrHHRnUOX?XN}{H$TtpZ-|h+gy&VO?`E9lg58?pCkOzqwF2J>CXKl+h$G2 zpZ=Wi+&!M<7vH&mWczK7!}n3RpdzzR-VW`ayerj{v2v3mVkF(_N@bR|&`#Y!G+u0uOpb1k^PTne7 z424oEPJ3l*z$_H|mvq6g!Y^)n7GSj?15STUN5-KsAz`J`0Db_xe22q^DW z6WKnYIiJ6u+T8SY-<|V!_Px)^zrJ%kXur+z8+tw*2yf11e_V4+Quee$U^mz5U020~ zi35xnH`8nYc=>)d8-{3bi1$n`82(5Hn~%8ySEq_1glniCcPZ)laDB7(3ImI|fM7GZU_b#3hv@kvC~S}c9EFn_1ZHv&c>v?9E*`QUvu)N7 zW;4A7u=;yCDz|#6R#Tlt5*6-gh0y*)?rV#~;hdcH^D< zP__>@N4EQ#`&hP*HfOfmn)_h34>p%&CNea&UM4TQ2tG}-{$57_>1jR$Z)Q&zH_Ghc(y0+T)O#q zI`#bHcTR{m2gSek8Njk53P<)y20NSu!=Mi8O~9%tkDV#*MwPHJ33=>-Fex#E|p zdZIW9>KeUC6GuvrAx@pt5P{JvTLbUQ)}qMg8!7&;XCv3YN~fM=??`8=f0d0qWPDe) zf1mFAHs?Xyb>}>APjjEjJ~ljCy(O8rxjt-`<|BblgEM>ckiPy{M)fb#z5C96O17Q> zZT{!x?3%OTp!Ukv0DvAdxsaC-NrXZC{4L$*WZSIo$tL%f=FV~byKL`l&fD-)o8!jj zYy9ud@?DQ55AmmDd)J+#q|ZtB+uWxe<^T6P*B|ds`4;^3vD`lupOWpbcW%CH`OEYv z>HgN-=Vtp`b5qUFZH}MG+wPpdB%he=Ki@f?qTlA`1iZnS_F3j!K#1o4b6TEQ<1rLK55+-pq6Y8*;N^R!FcHm_ zpi{hbXg1cN1dFkt&=*@&m|PG8xaReXB3x0<8Z`%fBBvJqNDu(cjtH_Z+k5Bi}od7f>-Wb`ns#RugvymbCd1HJNKp8{%DQ| z^0zsEBLDTT<(D4iKkwY!wEOOyNArcx$;;-Zu^XHF@@y}f%O8N-?%ci2O{e$WIsawz zwa>}Joujk+n!6+0)tt}dPi@ZIc2{$GCciq{-nqGNcRVPh@pDVIZH{>k@~h7>&5`Sl zY^3Rbao>^dw>fVsk5L|A{+;J_p66iRjNBQY^*nWn=HdVGckZ<57hzh@LG0sNnD8ul zpX=$f9M9T$!Z9OA6fC(-@$a)Gm9}@Jvw_>P^~gN88-{bwZ&~mp}KSd05s6$1_HxzwHbs{lVS^D0EiQYikMt5fRYG~ zQ9^#$2R*#R3&XA!Kd-f#O%#(F9zCywC7r@;}k)FE(l>%3;e;^x$LCmG zJULOJz6xT)rH6UMFvmDql8{i|?ja!L<=a9gK-75Q%?-2{xI(0`Gr3>@jc-_n2+Iq4 zc!`H{Qe&~#TFo9)Xs>4VuVVbG_xXp9^!SOweT-grAdKICqnrN`v>q5kRtR$D0 z1oeXBTf>?@VirT{s83d9n49Iy@sbTg!z@5z94B^jp|^-&u~mjTgpoE4r2Qm_;PAw< zBdTu*?B-I7Fj#Cy#iB+R2^y)-O!7x!UDixy2w2XA9s`!MGW|?07?uFXtwhO%qGH2^ zNvB!NlJ~i?H4wL$LP5mn2Odt2XO7oJ3|@G1>oglK^jNTzmI;cvYADA7ByMuF%hN;k zdg&X&cyXa?0#?YGTrivxAZ7nB!^J~@A@WWW29=s#{?$t zb7gBFE-{6I=MKuW7}hn%OX9jFR2P^r84lML&T=-fr5K=%M1|4AC&BVG(l72RXHbA$ zu557@gNtd^Bvr0Vi-E6MyKrEv%bLkluVuV8LtJ7;o_w;9qc0ji&@axKCR7uc>MevK z64|nJu#%H2j_`1DJafD(dR-HOmYNW0=z#tKGrgs>2e`{Yc6}lgHppO6p=<*oXsK5y z&Dp6bz%339FX3w@c{%Hv5ai7*f!$o_AF$Y3Sm7AT!QuKuC~T0yqC$DLIqcSFW@`G% z=8}p;>EofXO%C)>)W2!&(dX-%>pA|!=Dc4$&9A?6K9RRQmONl{&+^E9d2`d&i^uzG zAIoX$#^&hlzUJudzUJ)hw&v{Zw&wiF|EYKG*5>Acyz#M|*7()>p?B`q=H?9VymLG! 
zcilOEByVdjPvc$9&C~D$K<}|_zK+D-1K&1 zbJN?6&C%O^&C%O^&Dq;+&Dq;+&84@ynoDnYHP_y5YOcNA)ZF%VPjlPbJy4x_sEI_F~lQ(xTGB?BQycuOz< zHWx9%ETJs(g-xQuny4^{7?aSAhZaMO7aEeZ&0x|fw77gQO-(Om>T51n2(n<(9pP~H z%GQ9CP5r`#jYgqRBro4c3B*4kLE?vG?$8X!0udA~9P~+4<5)PR(&U1nDvL3RVL4*@ zflHOLsm-E4E2B7Ep9B$8A`C90nTQ;cAb7n>EhTjvpF}kt`k6^yjw@7IFlkLj_`R|< z@IVPy9z%pWSQVlKYc6S^L^FjRF_71K%?1PH=6>OmsBrQ)LZuLf&@s#IVKVUL&Cv{% zryzE7WlM$Ws#B{C!HNspmQ7Ho zuTO%)1{vbSRF^{%EbxFKCrQ$*Pol&@&rEWN1{jj1wA~4V&4qgm@Bwjh!2pUs9N_aw zP}m>?IEwIhYXoqv^lBiLV}7I+;GGQB(fOl2!PfmQ3E>ZW)OXu zBpEGRKf8r@F2Q67Eea@?51Vy*m}eO;nKNJpRr<9PaS$|*Ow_Pn+9Vp6<7N9p9B$~qOxuXrpAcR zwO+;GVgTjxNz?$<0PylXQ<$aCX1y-XU|x~H7lR8M2ZtF65sixicZm-cX6Vw^44Q8Y zy|P6hhC2pWIKZC}Q3H>&@xj7`D9;EnxZ$wj%JyKh07s@s0(mBCKtoLpqA$~xL_@kn z=~B~b4u%|d(hzoKv@YVw4m%umIGP|+Q^VN{pb1!^W^%y*szc&ZK-B<+6B4G#J4sWl zV_icD1{d}V9~}~BteaHQ#J-Mok4c2_ThmPdhMLJmZ3qz+2Jw^NtgXy3h7G|&OfmkoI)YCxw=o>Iz0QkOMDwN@+eWREO65+Q{Hd_D;VhU1DU3z;)StRq9V zxQM|;TXhIXv5DC=8^Qv)L)7bq#y5Um*_r?V5t9oB`1ns$7{pJ4fe#VBp|BD8#?WiM zW&;41$pr&^9SRRBjd{_-t4XH`SF2%7H-Y+4>eKZ2lSCJDyQ3uRj5BJQP}tCXk1|)b zD8vv|#zfQ*Q3H>&A+wZMll6k@+d3GsH30zTOfDE8kY}R8Abt$8hO$KXhQgk8#KXvy zEebKXXz_4>uR~#c05S)K;;JU=1;;mjUTZZQ07Ohx6E&nc(es%I*h^wv)@)$>5cSH| z0DulNxsaC-Lg8!PI-*)rSYs_2u5S%%`iQ|bbw%dVA@MjxalCL3k%7r*@4D4FkH~gL7}*6SDj?3)v%_xxvYq(YJBlgD6CTy8oeaKSK+j@7_Mt5!Qh&^ zqPQhW9Mt*2%9^N%3$#;wV>BEt@EG7T%SPM3ngXG!Ph$tGcWiBHyFTl`Sq}h|pF| z)DTfUGs{{l$?|HlUT}SDSkp}apuCaw%79N(J(}6PdnIYxl zi_}qV_CX=LV#-1?9N!w&G?eD-RCTlBpcJi}4Pcj;tJE{0uo3yTK!$8lh#`WFhXVwX zpeK;m?t?;cRg?9C>l;6>Y%vl;gtls;hOQ*tHxQVZtGcWiwfWYuuA2b>5mVJoo2c+C zt!G$kC7F$WS;Pb;?{j5qATHP_V`^3$5`UpRCO>PDZqxf7RgPqL=PW%V_#98p$g@aj zPc5eiha>faq$DCbILvb0wG_j-q?u>sivevUGm9`rL?yd(g)`8ZBpzBYw;8m&l8^~C z`s34u#wNy7yd;r}6?H6dfyK*(vwBr=m~kn^BeD=uLO4#rj4dQMkq~ur&@Ht(nnf=Z zhr%M|kjl@dqd=*l&stRmK;lHRaK^*P3}_<>1dj_5*X~>qECxeRR%kP37j1HApCkrk zV6TCOIuS523B*exymXOuzgW72WWDV5Q#U{ZP0<#pL^=dk(wNvK{bip# zBt(vR#Yql{{phH!jKd;X+?iWxRBNHD8y)o5Chns_nL^mOY_OB?LZ(rxrdqQQ-6To7 zwP6|xi&^o`5J_}!{*T?VQE=n9x#j!6^+ir;7)0RW+Dck8r_5wU!*N>$n>yeO*!^h~R|F9RUTu=6l;p%p3gokZ zaK1~&Lf0kRAbH(oWP*N{ei}T6U^lAj9#HQBTGA9^#?3Vq$w>}TzI9aRy{gvc+%v{q z&c=AB^?j0$qI~i z&h?A_a{hii{|nWB%J}b7Uv~dVv-cC4%~xBJp)zyQpEc9b>)~YJ7+b&v@<;L`Y1BLs zH<6y-I^D5DNJ|@X-CaOSni+LEMO#>0Y)hkhu|CA#-TqhH3dUw2b7 zF;VNsX${GUXqaWhvftFrT>!Rs)1YhM)XS60Z5#y?-%?)pmnA3C@5lSXx5tOw$9&(8 zA3Ja#vOAyh&G|~~D-^HEO7=T*^0U9a9;C~2?dR@$S0Kt5G*luMQ}JqpMJdUN6eOZndkTBv5q`*~7*eh9*lci6sahn9T z4a%iRRXHR9$lX&Ta_g*g;$DgydR=G-0^mr$_#8k2Ol;m%0I0Ctnxjaym6X(3K!~wL z7PIri_DTS9_he>n-Rty5CU|Ii*`+PxBSawja-$gIMfL*SYaePjTS?E}n2AdEc3+ zT1bhMwta+wO+;yhti6FceAbY&kdG)Hb>1IriB`12u^rgs? z!3^BBm9#Zfgq17Sb!pe~4H1Zr@l;VL2sVr@$q*npv0_W5^kvD3z86Wg&=a%~-WnqU zSX-5a=;9hJEl_S7G;g#!IwtHY=_@>qdL&DV@vu^7VJ;<{=vdg}yO0toZE?8=GP~DF zpSu*zLkqNR)9no*FMhdqu_1F8*=|Xe%sKB)XbxI-O=cmG5!J@5tl0>vwNNN zxj34K78o%5TF!Y1|G%B}Owye6B+9NL4QfbKlIK4fvE_(&V-SeB;7KiJ7H>s1U5O*yKV@?%#Jd6OTD@D`f)h#*dZfa zU`3=Ad+nL2YO;exASYgv$e8(VKi2D>%lR&SFETSMGgbls)FPU7ofROQ#0_Odo4qbA z_WdwO)x-v_V~eSHwKYePYOA%$!wHIWiG+o&OSX9^MXZRsN*|?_v^UZrqov{4O2F+2 zW$X?hAxOJK5O}pES(1~NMHn7T=QV;kmC|WpZxbZDQS(H)_t#K*dqWcv!019+n(CWO$@n=n2YQ#t>N2Es;GwMmm&ru4$klT{bf6 z0Qo^jfjb0VZIY)1)1*p@#%&T}AYB(dQY|F=bvK0waMxC4Wym<+Nm#%Fk>kDqP)ji`F8wrEAH)qAEiKTtO*ccdBgv}? zX#kQFCs}OG2WJ+8N(#qDNQj}zc^;`GNXPQrJu?y0dJW#gxJCrenw{WTo@+_N4Ai5! 
z+$aE3@oGzw4$W3lUiX(JXJhh6wUCn6T?S%s*H+%pFe4qxiWVrhP0Mca0P>&vc8y=) z@pvMC_VM|YLM;4_`sDcrd0;m=Tk-KpK8s`Fv-G|VcxST`DQ$84oa1v94c;S&xUF>D zR%}zd9J@b_NMFQ4F}Is$<`jUvwZl43lp^I(e36Hat2(5qTViIldugQ~ZCoz`%qcyer|(^u0)`g`S{|AVni=Swo=G>5d>rOXG&E1e=P?dwC5B zLE0sPz^jey;wWUy;B2~?lVxoXL@Z`crH%A}n`4HbDF#^q+Hveq*14ux?OI}FB4QeZ zLY(sh1c6su^CiW2SgEr>2TPaeSlH{Av69za1`*({t#n#L#t|PHjxEnioyNqjv5dG; z27>Tjfb8)W_UctqAmhOl*i$JT06RZyyDxC_Iho-SzX}kw#||Ma4X>7G86P1660m?1 zg<>jRZDf~2!NfOPDy1(=P9!W~saog>%3TIxaMxC4A-Xt{cc`{5Z5ix)FM;Sn`ka#- zujcipO}AMFH97Z;yUj!Ix<$Xqm_TXE`n|W$u;6pdcxp#gX#e%cBYRVILaZO2Gt(Nw zY_)EpQp&O9sL=wws)e2)^et>z*S$_}q(fQfnr5|Su+P|?&|JTv7$b*g@_lh=nfR6h z84o9tVQk@Pn5uNpm$GvWF+oONm9%DJH(DC;tpuAo%?#N6(MxC%$Pmh&Gx8i@np8=F zd=|hlw#WiK6~y)hXj*Hst$yothnJ$+q1rlpG0=^M*#a_xjsh2|VQfj3(jFh%;Z zrs_GfKu;BceSwXLiwQD2D`^BNZYV1nj@xpS@ev{*0gE>ku8u|#=X^;{mhOZp?k{U9 zFR4p|JNhrA==G4UC_G@M;%o6gLW7sPk$|l1>u|XTx;rUL$~k zw0RyPau`08BI06VB1$W1Yp6)#L*s_61e-d|4A}jVE(CG~!G^I(#>{v7Qa#QB0xW)( zKIc>+*|0Cbqy?gj?N-v(P!U!y&3n`;z|<@TLp1c}Dn)XVBmE>OQf;+1-OR}#F}5&J zJ(Zd6MX&oDiRUWRbXMN5nb?h%hGQ!MrUT*CZi-$)+Ob8D{UJ%G$nl`-B4 z-e)o`cWL89Y_4b(;LczqfMvzZX0_AH#*3Pk1G>-$0)bas^CdZ9s>b6iKw$A>k%c{q z9FjDG$3mpbcCXVKDw6onaBO*+@)=UBbnK>xLP7ex+Pc>eAUUy;qN+_o45aI#N2-OM zppA$-C<@kAWn~CW@^kvD3#Fff98|gw$ zBqTOhewFtz%*d`omR0wt72DL!RczK?44D-5-hD3sixO-ZR8my6JuErVy*>SsN2f9h zXbe$PrXM>7W$h89&5CI8Tz=;Ah6|Xk3i=&Vl9X?3g!;)jC1uRtyW&gOF zl1cRZNI?g!fCU|$>u5rMcz}uWBE4mY=DUPK}o>-LQK@4;uCla*P z#da%c1Pxme0#~f-bUaI%8Ki2m(4~$81YT{;(@i~~Zz+(8)DH!vQb~Ie5jhMWiab+9 zQPf#UTSG-NBWh{g!M3ebE?@{-{2=WTohig63qh1Yw>Eh=k*upXNv&R0F)FpWIcA_Z zvbHMAnb?h%<~>p=$Py!b6`!lnBDgPsDBmFIG&#C9dFZz+InlWiEPNogFF@0pK^L|4 zwlh?`HvaNpR z0mvo;CZj4Zx2@QwZmz1X#;($fS!n3}u3_-T<|)k55lA1CZ`z3FwY zQ#1AiFReS!GdxQgau;B-&`|)oj^0bImXkxQ)WEr%@6x+`PqQ(@n2G?LTZ+vUt&*k> z_nB_i99yBS!}qa#L(Gsd^9)Rdt5`ogoi8cd=uXc4W$6+f3s|ZadhY`2;&!a-tN;h- z*rBXwv)839gPlRDCJX742m&u0mb~R~ui8l-`k^?NRq8?9*+>rnY9V4>XQk5`GS=#) z;n?zAON=&((C9+iB{DcaWqL>AN=Hw>?in-VXGz|7kqGPym=Z%2TNZcnI3FV&dfjM& za@%5a0d7!+{Gdf}H&_=7iH zb?le*-~8?Qef3-9KDU2=)2MqbV@7|k$Gg9rU-p-?uj`k6ynfq$Kc4@T{N4WcxNrY} z?9G3ze_DJfpYYdxJpcLs{dj-B|L#5C+4CRxU+O>W_jvQ~;{GS%7kxbc{q)~a|5t|g z@Bcq#_nqw{{lbs;YyWcI_xOwc?eTd2!A)_ce~!E2JY}iM0qIh9{um-tiMJ`w zVH_9>l8)PoZR!(ZhJ>I?RX-NGnB#m&*%|}|a_*dWn=P_{MHPX4fsI&^F5Ejm4pe&} z+EJB`XrQ^phF_E^JJ%Gr!c#vym84t7!+PCcmYj{rBh`~k+ie-o+AE4_O$GQIM>=FGkQ&)FU>E0Ov~r9_GB`vs=Ebsk&Ac=&2&GF96h9lWp}gWqlA2MbjOs zrJnWN1;RADo3ir^Kys2J{Uqs@v2@IQvq|vu+QoP$LeEBe08nd9w)ILRIPV~57NaU1 zw-sBqw;;p}2|{}GDhKFT&Y&$WCSqEz!F%jbRGSOLD?g7IMn`P*5s$j)lE`L9(T^tw@*nFxKggAfp`Oy4_ts zH!m{Pko?ir+(i&2Cd(YinE7r$7UZ)y7QRczq8tyjDx=7gkqNR{`f;3(kq#LdgONm9 zv1KDOge^vpcIPCADBn7&)1+!`^1$L)a`NQ1}ZWOpv@oGzw4$W3lG;R|<-z9b3^-VW>(d%vs5#aHwr0HX%Ls`)RRkeBH?&@yJ z&attt@F5JIElHPyvlT@ur7ufPq^=9^Y*qpQ)FRr8*lzXv@%&8sX!;1*?~|NzKQtjf zEoKq=X!(4Tq(igS+N=j^$uPF?RIe)C>_^$ThTinP%h#zHJDwa{>A0=framDxZY3bec$rrK;^=$%&3dIcH;}kP``T#{);!R)BUKJ7l!9K-;$1cwvv2V^+{2 zKoEGfHD8hwrm<3I0RoF3i!AI>8WudtrRSm?TBn~h#~8JP^Jj;NgO$lNH07;SeK@WS2IeYO}lQF##y z#YU0id`V80?j#SU=AKJ*EMTcx=m}!dQoK*#O&I8O$M47U!|FqJ_fwsZWug1r{h;`M z_+FpH9CBdHd&8L9Tu=uE%Z`AU2C$fek3@*NQaCDV9j247l{3YOxuKAt_Y&U zYa$CjNz*;_^a!+oz>J?IVPTId9rUHhlfew!wUx9rRD_i))^%yu@(mG)j`37cCh3MiMEiF)P8#HgUJ31!pD(Ndcjd~m*6Gq8%-*o2qJyjTiQ1FtO19EGIem`PNb0 zG9FgyERH2-V@6Jxz6&YQxDCb-Sg^G=_VM8|4P`|ORMn;%MkXSrnH9td1;EoN;+!uj z#=}aT1v*$7M8^WXs)dvQFli-H5BeCKp%_#~OPg7pftx`R19pF;3oU{O0CkK? 
z1@c*-VA!NSFBDAg47XLn~NT-J@1)Q#W@rZjQ}fj6$(~geys>iG*0H z+8$Q?bxEJ!hXlYB4wU1;xXuc2XD|}HwCQc2F!aXDZw{& zgvU2J76vcbMw7=~1`)Wpwkj(_#t|PHjxA49vfrJnN|;@4lz|}dYJ)`yhUTHe2Wb;# zTMk4l>}_;N(x`d1y2QsX@Hv8va;P_wS)!$~Uz8WwId1k9q?>Jyl5}vklJdG|%uG(C zt{W18{U}B3BY2VcFid%mAhcU`k6N)!-Q3N%c~2!py>}BB4ndWcH~_uYOh$hmXgZLZ){k52^Y z)J-Iy>>{@MU6wPN1TPK8mgic&Ap+6SP8EfM^m(-<83H6HR#H^8Nr-`TUGzw`&=a%~ zaR)`g+N!J!!EUrP;#&#K;0YM8`=gi8B9INvk2odi&}_9fOp(4UIY&>gDhRsKPPns% zn3zasC9RoQ#E0fR=m?gJyj!@Nvhxf;jPE+qPck^O7!T`p*K!#KL5J;LvpX<@HcrG? zxwb0niA?a)@M?LkB}OKe5nV{TLasu*#v3F@v+o!^yJbsL_HHRSS6`fQ`tSdq&6l zaUw`@LqS%$g+PrYLa7V{vA$`t?)2P{+cR4g$NlBf>?9Svw!UDajg_Hm=X{iTyP4PIb z;YbrOi#EH>v|^k3gcvF#2p)mzImu!}&jGd!Dq4@+UzVKcSS)L8Rx+Iv_H1?8?(#O) z-~_O&FtFiDV4J$R3&;pM3S7ko6YhLT*=pV7++WrP<&r+XjNiL}x`<|7X9YMw#|~vh zo4qdWaAkMRROgBv22ykaAncW-TSj#FsM;j3$+6SIK3G@T8n^8u=t5+E9L_rs?Wl?w zl359sjcjJ19?j*7K!(7pO)>=E?Z(Z9N z&LCBjh4e{eur6%P(@l*nZ#ECFgIY3-Ej(pWNn;4L5j>V+9qV4FW{Pj*a1g=Gek`$P zwgBUWjsl=qKf;xyn;Ki*Yr_=j%aRib3-qcMk`2J5rC3MNDnK}i=dhVss@bV$c@9_h z6C!YPxls^6h3$r3Ye-I*#(Ldf)&`x$C{jJi3v5KzU4%%iH}5@y96400hAS~OgWcQ( zWCR@r?$CI(C0UXarf0(x>C2kRORmLqA`mRw&)w76;4b}0IG>9%6%#err7eRUcH$-r zX?IR?#s#b9>88e(w-j&OCd?)$(q1H$vnd~n?p{?C$2u!%hFzSg1`jQ8y0n+Nxr)yf zyC+pWfKeJTPdAFIq`dBU#gelzd8AtC3EGGi4UFFIbu!FIhme*QXxrw6yM;TUOcvto z@+dbO)8<`n8Kg={odpOiInlYT{gTx^6jEI3b)=o*aazOTR-FpRR)S5PW(Ms3=q0oW zWC&%?8F>ycO{%0oJ`3O&TV#Qr3S#>LG_5t+R=;(+!%NZZP;DK)80bdBYylZTM}Z5~ zFt#L1a>Dd%m?C{yQ}vu#pr?w!zQ9Jr#RQq1l{A7BHMe$HX=bqHx`#J4;z^$8J>5Hzx`fxed~DBi-$ z(3&Y>crbs3b^ViaZ&j1k_pS^hP=)YH2vOJlFCK5s*->c8R!4*_b9-l9QK3 z7#>XLMI`(RPxY!KK$DU)kCRB3?OvxfR3!1CbqCwFQlF4%%_1$XNKOC-yxO|glAJ6ZKB_he zZ2aK27WOtlvZ2H?B@<+`^wR)9L$DiFX#kVC8wj3rxX-<={4ZAllK=aBL-5Hhe>h)k|nLDLTo)&$o_|K#mR{RhuxI zE_SF5Q?-z6fZRPZi9R#}a=IhPkwdi*XC>HFWZuhb+#DMS%py=~0=jV&G9K3J{<7q# z(E`1yg`OZx$nYHW?JjR)jg%@YQJ|_eGDu>;?$6GN2(f?)+f6bA-|fdz)h23{DQjLtkHO*?LHx)1HGJg8wb*W$X@p!)d2Ke~>zW4_E0`BA0@8J|17 zsp{`;<~Uzcw$an0#OPt+>$N-$QsB*fEb~rK6&>Ye_RSc7OB|S|leq zhAkOu-|fdD_ADU4;%Dh|P89;t=Ap=w!3?XKDrsxT_?az@k;}6T_6-rZx!foNaSL8; zNzzRXmA4cQJitIamC^yQ^TW3L0ym$N87}dw08xAF5Yp1{YI&CN5h5S~3pi0IrsCB` zb~zMGe6yue`m*Fi!UC46g`S|?WgrH3ZB-Vciz9i5YU|RL!M^trh%ThhImz*AUT@lT zn`KawbI-WjJoK(x^qY(cl(wwjd;1IvKF5rwc2tG-_v7&i@R9RT^r2Gt+4ynskvU&d zpIUS$OtU}-iysS5^;FqNr#^R2QS~ajLL?2zh-lbqsb}p4w4~W6sCq48M6y=2qosL|R7#iqgqU&jo=S=_-W@;RI zbR0n1G>)O_EXu)32k1iUK8GS*XdoB`t(n-37=lOC%`Ns5Vg^>5d#M15*96|?cn@r~ zhLsxW%bJS#T)c=#1a?{|ZE*)h!P-iiJ~%aGw6s9mwo*4&u~}4WV5)imqlk09B;cbv z$pZ}7Q>i2@V1YznUjV43*j)Kl($-KBRxU-}qn5plj}U>ItBeA7Dqd~fi;MGaFOP|D zDG*|8k;Ux%u92mX(f*(yk@VMv;-L(4`t92S48zhn9(NDUk7S zA{oXOo`$JP2Yo3!*ANqAZk4T9a+{+brr|etmv4eunp%_2m0e0PdHtBy>Q09~_pQ5a}Txg!DlJ zO2T;OUuMD1coe48DW{Hx>7|&*B)|%~dO-2|HPl7i+rCt8mX&~&`$V{<-?0XwInAt;W!Q$0cuoKE=`AzLg zsehYeM(0eKqug2{VJerL!(lm@VKR3;MMAlf68Q=Ai|DX^_MT8%y3>6)LCFL^g?kYN zFbU`7;#}G~A`fL^1AhhR0macefe;9;T0}GiiTngdf`NNIq!uh*Z5#9rjR2LiQ>fIe z&2f{5wvKYq7>!T~exUsvi4LBF-2oPH+{LmHhj%0xwu&INuvOw#uoKE=2(9Osf-*B5 zL0TWQb(D){MtY@N0Syccaj?d13vw8Qh=w3poY|3J*gAyNDnh+FtDF?0KMD8PIarR3 zn@nwwe)gVFH#*qRtj=2+5N1wq4yM2p z-8Zy##5FMnHt>PuxzP!~Vel2qsdlfwHQvi(_=S7)$*Ie>bipM@K*ws;MWW`>fP(j>ETh9m2oDRY!t zD$C0d7;Z z{8*1D9(fADaYG`9{;U?)Zb`7nNi;jlEF zos(|j>%4{vz@Z}U2Vkuhw_B2&M>Is~o>MYq+n}7F0E_GW&cPJyxXEDfIhZP^*JOdZ z_^}>Qydi8c2C7A1ximO@&H;(=U29!93fy?L#ak!EGfPgvf~kPY*$zHw>nIlu#G0ye z(#@Q8UPA>19U|3et>ja!noD#cZWjuBmV9QEGx~5+tjS6k?;NfIPX|Doms#$iO!Bi* zfF4kMopB9xCw-Cay$`x@*hhk4>oDKnkJzWq2g66^XSJU-pAc{}#;gyTgx}wyAJKeF zy!@1```7@){n+|6>zz#5M_RXyhM3vUPC2Ux{nS}G7~{s`)UK3Z26^5j3F*v|mlq_= z@SJ2xMC2sn-KAD%=ryt#AiYXC;|&PvLY$A(M}X4bRI_Dn)KGuC4H3w7Rd6Oh6^U(s-w|;df 
z(BrnG>e-)yd&mP36Nr2Tk z$r4}ZH6%v0WDB#U)~JS`D)uB2Z_<2WTrI9tei1seb6w4=X)lYEKnCe z){<1{0+ZsdVZHrXl~-r5fH{DAqMwBuueNv%jz&QB95l(SoH1s7(AH5dlNp&qkX?M* z#nsEhsKOvx5YdniT)WvI$r&kP4$Dpq>7`kHLtDARy+~XIWDLqWXUZIL4Gmh|3gk$Y zWU?;EDDE28+dDxS6Q1gTTI)g4&%%vYo5CF=o>_hrESL(YobBL~wvKYqK&+`cC*3eS z=QUJN&>>Qd&MI6x)V*u1%j>R0Z`1|lWOQL;LGjM9M76J-JhXM}sX%?>QFTsu9H}}( z1!N9=5yv6gJXQ}0hBr`@uL4W;5HJMJyP0^%GY8AjDGtKh!3~bKa)o*&1IDp&%BiDadMV~H39y2$9#Fjg3~>>k zvxw>7>92b@Jjjq>*jneSz*4<5tDF?0Kgqc0gQ;*c#;gyT1Zn8U$oJ`1;H{Z~tCv?q zy931i3{i^lDKSRf8|wChQqEdp=LI=PtR)s2?M0B`>ZW_jhbAF+vyIlH3cC8uheXfx zE$$jGHKcg?+M(`UYYnM|t&-IxoEfra$wl@eaYv{%RMtr-nI#VmTHOldNX;9?8dh+_ ze2*G3=-~*CM5L`lNUb8gkUFcJ6r(>0&#`l`92+;8+B{8Ho|s91)j8=VzRtSAq_}HX zZ|}t90pb~HOi=yoJ)v%Nu%TI@%r2Z9i=Wx zy?#jK=Xj*O&_y0!HzliE=VV}|YR0`TS@>?`pjd_8`!gS+k&vRi_m;TMphmPy4H2E| zua*2Y9S9kW*?zr8+z;sYBl0`scRpXX)i+=E{t|rrdhx5kuS6TW%MUWzI^r72FOUto z6`%(c-(;GQU7JIjsYrmvIH@by#p%)t!WaY_R$nUQsQOaiRn zs0Un<3eczUeus#LAW@GP^>K;@y^M}(JSo4JW0%mHhy*{0aoWE zYf06!KgHcvL^K4eamaLC1B6Hmg>LS&xib22TAE(gjCZbo8@ubAA?4Up*$N3$E&p;u zU>kOY_d8^RrJxH(un<74!$Hx{!j@N?!W|@@S$-5Om;YOa5qV@BspIjoh329@#QBuk=tCcY}T zN50y@6B+by*hi|0Vof<~YF}*|d{z<$U%(x16%;ywZmGqd|LuwJI#BGCeGJ0UVtmmL{D%^}Q>w~tAa+yG^sapO(hpXQ(!3)w? zqQPfXUY)@i#Gx*)yOv_qjMf?L(#)`Ce92sOe)hGKX%%L+DQF;)lLT+je!}a|Pyu|X zf|3E~v=O&kl6H>>QcHe%GzL4NlxBuCV|>J|oGEiMEmz=(w>_X&8?Zat;E^I8OzKe;Mi)II&OGCKw@-PUnIwx7@{e)dHW~z%) zFQU7-%pVeFwnZLaHzljXixSHqKBrsI_+R3yE&k@(eOl*(<%|Z_-UVnyA#GzJi z?*v^qGyMW;vhiCOEiuutrWzgs2C$jY%e0IsK*pfM!TA`-aXT&7H>YQ{F zUuWIC)CC=~z4sYHd3D>sz5zIhOK!3Ow!89@LwvmIGeZF>8*yewf?=x&QVUxpZX3jivKd0_Ii{e@Oh=H`2TelqPz>_3Qh*7d z_&Vz*=xBzAY!8BDal36`FigbRDnh+FD@J(Wg=*_LXq*Z+W6b)XAzXPVW&u{qk0q{t z!(cu^`bva1E35MAjL3i?pq`YoaO2gsK@N^K=St?FNoM7YG3$f2j&hmI$Q**~;?pj! zULHmjcMTEI5G3WvV_QJ?dPuDza)tWg1ZBepR`|@ZG{_U(rbI)ya!dh%gze(T5?3!z z8C11MfU z6JG;5;#_2VC+NcA9SMf5!$ExxPgB}9h#6(${6wC)W^ImV@JU0s@{BMGu!0UlmUul^ zbg;Xj)l$2d?lGAiNjusB%c6$d%X5Pp98E7Jj5ld-;tWUDIa79dtV4r@8M@?14e=st zJQT&Mt+$7ojZC{G!Cif{57 z4(Nze$x^+slEvX2Ne-=|*(qnqpWSnGRt^%wKMCWVgQ42jPNvmAv)mIiafk$HmZ^u= zpJAzYjh7mNm3*y=3_!nqd33W(Y1<%C*_i;7%JQaW>}Fh(hvp@+Hrr@Ds?JHb^Y$XR zU2)eC5e?BDu-kdFXN-b;0N-I+jS1%8v z3T~M1QA4m=T)S=dLKlgPtB72oS}~!-3zMxG?;Oma9XCk=iCOZ*LBd+DlPrnrjB8*9 zsztWc(oPWKsIoG6fBqusGRNKleUg>(Lk)JIw##QJm)o3P|zV# zjn>MWP^}t=E|i;hgl=vU$_sLkxUe(IN7*@YwoHK!yBl5pfIz}jE;)xIasqXWyM{=H zSSXPnPd_{{9H@SF%31R7B>5(kjS*Ti-nnLNj+;C*geygT7~Cm3L027Rm;a=@pWE91vBC-B5DYd#kJc8 zGjb9^Y8An>(N-TcGZ=fXV`4Tx`*H-~OqfyEqur)z`LP}moD3^QtS4a95Uj}3^dba= z5v}*SVXEcF5?|*vQ~(auh&2?+MqJ}15$O%= zA+-op(uUyd}2VDTaTNv6X?zayuP-(j-VjPewWEfI$11LlT{?Pr(i4J=Bmv4@Yn$S+ouz zwXo&Yb~pK?ahEgdVodRFb36mjgy)EBVhn810m;W<@p@eY-BTB(UOy!Awc3a^B*bC9 zc9t78v~~WV0dX&OicF?6$nz!-4O1nT$0We&oMefwGp>Od5V}$7_0v6^nKz4AYhB(9 z(fMk_a91TT8Njns$hh3jn>;j3l_b+-0XWBxb>0y0SK2N$Bt!iM#RA;-HZXt^L2AL` z)%HBX2Td>N&&8-QYh!nDE^Qrg4P|13ZUyK8#nC!}u*HZ~i-?9Gkst3!FmSJj)PlvU zZG*m{5ukE*3YD6*Id1aM)=@4RqY)~>544{n(ZO@DJHR52yI3~j@Qwt-RuQBYwo2Rz zc0$<KmFFca8GPFvxr^_zg`0S(i7{8u(m6J`NA5no9_}OAXl`1c`dBHewA2N(8ATAIa)I zoS*>bN5M)7hH76snKl~2l_S$+fx2~)^@!r(sQ_HHNQNT11I9ZN44`^QEm*wTo=5ng z=>^^Akf~rD8+T;`EOW%wOLiNYDy`(0n>TFTuHc5l6qF2*!Qpn>?1fe|MCqQUv}t$~ zD8OV`-^Ij%n=xj6&=9UX6oUY(<;Oa22)bOsMe>yhPek{a2#&ND+8$C1TP1E9-azrd zpLeHW4z`Vm$ZnTnEW z`Dm=1?V#9b5|AVV+@|WBHkWzMYpCG4ts|*j3@iDVUPK1A4k5K*DRJANoD>iId3V!h z47Qu&5q{Fv(G(`$B)_`&I6!5tUtxld9Bh<4sYv9n@ir)MPXwt2ONrYCy)^k+%-oey zvo`jf7(l&_a%;;Oz~S;~q2_I!CxAHK)!RFn(c;Wh2h>^*%45tbLcO{VC&gNNON10E z;Tq(5lgra|<>fI6usSE*#Mc?uKzGs?+1~q@EDrlfFl-f-_xln1)cG{{DE+MVvkvP+ 
z^HUzYb7>N&>(OpgwR{{NQng$=_UuWIC%w9yCdi%3lTswm^AP0)Ul(ULZKU=&8MG=M6!(E4YWehZ>@LOaw=gMXLx>3tL`o1v{Z^HbY8;fYo3QI9r@c zTSvK~OdO(H0eV1jSWX~pv7sTNA-Y40ryri91FFy0&Qd)DdEiQ#iHAIM&DxSVF+da2 zkem^d04q2!WG$(B_NN%J*LbNRSgFT!)d98E3~4{Ig$!pKpe07r4F!G>em>*-VULZ1>IbTx-znIaI}>tyg8Pr_JxpAXAh>jhcXGU zf(}F0BZ4zXp!-MY_EONrgnc9!whjm7F=k=QtF7P$iD#A{1tm-c*329f$xPbvw%0_V z{1wPC_wmjRrct?76X%)*b1_ndj*rj_Wq4ah;!J~10Q#($nSz9vbEAg0aQ)&$< zW1V2glBk}EuL`ax{!l{(JskFtWYIc=)SA;*+Xj6@`{wR6PVpwF*XX{XVG7UpOkgcg z7eCgLROkYe;;v!6{aKY)H{_YddPL=QR}FWI2d+fg%0-wr?GDbMvQ9!delUI83JFuW zKwH54yvRQspyHaY_=9rPO(biEe6QlF1i+{N* ziieQ`_q9emOxNK?iJB&_8+jb`xy ztk*bu1jducn3dJyc1w~YiDAA9EY;7Z5IFB<;vwJ6N!=I1Pa48YVd6~!>f*-|S1%8v zio1qLhF(K=z%}OHy@e2$g#qMVo*UfYXe(E^7eNE;rhCdH1~ApSJSG8F(A5Kq*Pr2l z8K{1#-dKrzs#Qm>qJELb_4354KD(A5%=iV!)*JOb@NpPgDC>}-% zW_Msj)Q~}SJ>zY(dqj|0xG7nk{y9NmmLJ7lOgt5{9emO>OU_giBjvwN(?ZSr3ANeLX%eIu3rSh&5Hqzg*_%<-NsS zLu7-`O8y!foB=tyu~W{1#j9bHvqrOlqozoxa=^U6|lL zQoe(14;8D$?Y5CaqCKRR{PgNRoS-nyPhl4m_ljoW$U0}r9Fd0x32PZY){;!t1sMf5 z%vYkl6WW;Yj)cP2;h^Ye&FQO6;SLg`KMD83N0}8W$4#aN;E1=qEQOGxpo{g0oM@;3 z9Jdh2u5}OBZb{lbqLIeu^wp-}IY9v?!+H)H=3u)yZt~D1Bu~sl4|G7Y&;yFs>l)}T zRWPd2O5|%r3}zW(W}UB{&+5JR)idTX?@i zpu036Wvw=14JA?Mivl-ZZSlbMXoi=PIWTsLKLdph=#I9Iaz&XqL;?}CpChf+<0-D^ zOYLI118TK*T|+_?m1k#}JtM1cXnJ`)#yi))joo$5ki32=wL-#F%a3*5Jd&|XO zlTC-G8iy{l{h(8^aO2e$uR-FO;RT&*=%dWa8R{kvZ5?q<%*1Be52wuy@eX)h!97d_ z5e>m=aqYH|Lx?@37J*9K6z-t=4xAqaD;mVyrT)^s_WJz4-HN-{!`f3M{ zD`{0;-I7o+1k{sqmV6x~obv|_D9Ic**R0L)mO9HGAos)=1X!JutVaYVqYCeLNQQW7 zMZ6k{idq^#8mADn`gtA$FQ@c`v8RU7BBr2IIFONxp)j7!$M+9f~r{Esq zAfh2yEe`uga%dewYGKQ(ZG*m{nF@V|&zuZY``XFnog=R0F|k1cGz&eTc>Nhd5rRiB}o5EzUB9)k9k}Qdv3>xT;aFOlL zYH^L11Y_9Ubi?h8Ru3WnZ)j%RHEyn1Te2dhnKS67Fji? zGMODoJ4p=7Lov0lwheM{G`*C}lqQ*#GvZjrmWQbl5@zB}0_x(&5?3z|ql&wR$OfNP zd38o)fIDDW)YI+Db4DLdO9M*6c;{eTa9=xlXqa9K8i)(jt&=Q?>pW9l3c#mZL^K4e z#Wmgr2JVRu3rZqxGm-K6iq;qIl#f0M}DQGz1B* zXS@vzphS>bxG8Z{xC7bD@}pp-)U3^MlO&L4vH7O;El{^kx-F?XPX*wFdV{gDKdZ$x zUJ?x46U|OJYffKn3U?qI7~!1k6qK`BIorV}O@cIKY(THy>gaQN_d9Vj5i4k)xLJ}&=9UXF_QqRbCM;#&TBa23{+ch@5JN*FkN-T zTI=$D{3DuhH~BRm@jhZd-QSPcFRj0R^I`bQ$zKWuBYs@|QXJ0gNOB}0`m26XFW;LK z>kWUQzs3Db{ZjlE=93=GC+v%%>wEBPnkg>@;7~(ELy#=a>_{+d9YSj1ro>I*4iXQX z9|bF=W^Im}B!NU4*}iFogzeTzmZWvYKQIGAL$*Jw#Wh|M40538NjZx^UTquX;Amzj znJG;&D`z;e&Y3btxpYQ*RxLkPbM*4w;;tdG!Dm%ooxuXS1D1usbo=ro*=sR48c;Rk zor7_~eeL9-VR|WOATCh1PO>Df^Gtau0H1CV(GaW_*LWKkxF>?t!cB?W27N;_1Nsb~ zxn^zbE|a3IqbX>N)}w0q+zIN6;*qBSTu%|v5G1&s@is7k5I*4rDXSkAjs_ zvo^;~l0YJjEZ(8pcnR1#$y!o1ZphkGzf`U^9qxefj)V@V9#Ts_v(;IBL(>bo_u9E; zZR{@2rD;Z8zlA{eLSU-7d3%R7=B|kKe5oPWsYiSfYxsyDwO}c6i|2&0S$@bW=O|YrCiz*N6COvZ&X6$iITzX9$&40<2ed$~eZD+O5$dH`IY_J}78>c53|Aa| z4c@sle3jH|vcT}Tg8W$L4Pgx{3_2u3{bq^sZKMsC){r+2gJ`Mhr_;3Ff`_lXppL_p$-{SsS{{BVv6X?_6SF;~} zPyYtLsQxMa`v2Vc59ssN{Cap7+Tr@`K zSGP{OiLW#Mf$oNeY=2hr*O;62hB}LWcFI|>c(rYigQKl{NG_NP&^X&cDbdzZu6dXt zRPs9EAr53M0nnCnk?oymBM$FKFl-eK%0*`tp&r6HCu=MY)s<4Swqzk41E*eN3-5P`)DR@)DPUUw*+q6yx_x;PeG|%N$u)ZBUQSae?;YZx>>{tF4`R(Kz_1Ko`7c#0ho>0` z1BcW{@obpwGpsoMUzYhN54yMCPob{I>ZAF1mVbCqAG@bc&4=-P3Krja&>xh!SN&^4 z`IX_9pkI`K_(uQR9@KB>OW)``;1?hC>zRD2e|hLn$n@9eOApG&l)KMDQnZ_$@CNBBSSp!1O5f6)8AhzeRuiH+uh0yuY1ON#)nos-wU0`@4I7f#Uj) z%&NZqH|~%AT>g{HyV2vzFEWN*i&;B=>2y7``_q&e*gZ5`}Ite zU;KOddgxEcO6E@q{p&wm6TkP3&Y#|Mqy7y0H+I1LE%?qi$~TR4 zepRSC7R>Xn&h!%8!&`Veui*83%J1AL!FQhJ3}>8?0FS;vAHiRn$yFx)-wYi-kHGfa zoqEAFoE89LROhyTeI{4gdwHWS*A1z+>b84h9@9IsMd%Q;biU$$`9bev`XQNfEpFBo zr|fdibL?HY0!*p06tMl{gVL|zia%u7%=$|+z5Dk<{Y~@h>^UT}@d$tVpg!D}9`pxg z@~OJG_hf9_H`?QT=RtW~mZxHTMshf%b03Vu;B 
z+JpY6%s+cjKK35)4<6Jf`QA5*?7I(&CDtuVf79rM!>Ee(_uT`&ajG@;AC)+86#|;CFsy`wsyBw)E}m%I`tH4pIHh0Q*6ie;Vqa=a>FQ z&vtHpPxLQ7=#R?08~xFz%QxcJvR{oKHblM&ejNHFRQqkjVzx3S2^cMlp>Q^G^ZY@X z+Wv89;oO!tx3a42QVvje?}wn~Wcx(uhWFEB*hPxzT?Z+8w&Y z{|W8(H}pxkD2r!fj0P;Kz}E0~7;igqYWoTZRc#%;(9!WQiM?+k4r~h&$9^N2ZjO&g57!D2Sh6M`k4Y`(c zK*{h2bG<`Q##k}GiQo!$h%(O(q2ETnvA(xI`+M;#^~=qIU*u%`Mytn$$~A?3 zt?Y=DJCvd(nOrvKCPLn!{gVmE%#f-6lF+|A(+B+0gZ9CFk7RnIJ@+cW&f2R+$eocZxAJ=fgdC%k*u z^HAV7r0?AWrojN5N87@hmX{W@NkIa}XeP}UvWFh2q4E2BbCLHJPqXns!9}flv^zo-s?qmo0;mrGt(8^??WhI z{`_(%yZ`K>|4?Qw=t{HL+qdhgUtN^%AP>UhW8q&_wHJ<$H&6x4tvZD$Tt zVsCq-J;e{UwDI4&tr+I%?vaGN-0T(Jt1)OrUrwnVh(@Olqwfo zq3`4EFdZ;PIA9W2Jg9hlcMue|=gZJ61Xsl`gm#H9LgPEG zy_fs;qW@4PcbJ5{xJ|yKgcke!qDxi77jb{Pet|s*;Dh^q=B+E^HS(QBio(X2pQf*#uOJS->S^4Ns^=xawS9UXiL>|IL4o+?_hs(rYn%}S?I+7 zFf`wV%X0x!&e!v=WPTUQ_x$BWgK(AnCFHgRV=QLD_vlEZ-)djc%k>M7bH4hjY8V9O zuV%6x(VzZB4GAXc&xeLSzAWS~lRtCOe=764i~1e(OZae6gtHX8bCKfd6nZN$aQ2t% zH)@AyhK{MqrDDi4g)jquI7+pUtMZRBbnCdlf@C^4M~V zc`y*6O6UD#E?(JpnGxw%TT`@lNn2R+yo7p(8HLI=U4v9=TRmdbp=h>O4GoVPaa&sk z%>Jisho!Fb z-Gf8CB374u{xeqv% z{0gyR5S_-3GKBnq>9T?+`#{W6Qq#|n?toLlvG-IHnOqKWnDdCog03Kx`7u#?sOd*i z>M%mfE2+TDqm%3?!|}YpbZk*d-qT>f!(dJhx-_A~vG?TQqozkDBSN=eI-R9eL&|V? zFqzZ1zLyRspf3&~L@KKuG0y?h2_`_kPQ&40K%dt4(#fO_WlJ?Z3XZBg=IL~fKkJ~@ zF4XklKHyYvmJG}wI?0YQg#3W%;FrQ`5cLZ+J(Q3Fb|Hv^lj8NbuCG!g$^#!&w zqzU-!PMR;M@PMJ|g>W>A-0>`d-VKJGQd&9%?7~kPxf!jcIX2Q_hrliS9vo@R3r-V~ zU^_T+homz)c)S)=n5NTkWG%ZoTpR_Y(*l1=7a|Gk>W01qNuOJJ`#X*f5$M%3I52!B(0x5kvLwSK| z_0$4z;H6O8IMs5&^?_{^%BtZw55RgEWtdi-L4Zk7A0Gx*dDXH$VdLY1a;;Eg#(o!= z_7!BCQmS#0*dB0w9PCCh>Iks_N<3X*hCTre05md`YNVU;fG8n_ZMR~kjFrif7-wwe z38=qae4xzFno?y@OQ;34$bo_npr6oWYBxilfN*#SkIK|EPO&|p)LKW1UYw~Hfv;5; z%y1f<0C}ZTpuU>av0PAUnkm(Cpx`rAN~@7xAl;H!%1(iL&EOAb)OkfORi>IA(D+0H zUv`oUVzeY(=@iW2Pa3Bgtz~_bW3dG++PqUuvPFh;kY?n|PJsi>3dR=VFp(+C!i@0Sf9xBrh zQA+KV9*{1H@uzFgfI2Bw86cQpgmJuLgh zwc$9Zk&~%W@mXpLhr3?Ha^dNs6x-bh zAv@GIooXgP*rh5{su?)0ZNOt|D0syXLp47e);Ak10cUFYXmE%N07eoJ2b>o}`uVXe z=1^E6o!-D|q;2m3}=7?E}R~%GXl+1Upbpp{CD=Z}Ca6;Y^5G&-h0$6Rq zklIJMEciUd6>x;EM~p<`b8Ho^E&k|UQZ$NRwCj7ER8_`O7yzaS(EX+fAurNZbA^G5 zNeWfQjDxg`i2_-e!RXP}6^?B{jI!DqCab7$7|buD4TynZ5d3XntI>=j-l!Up$&SFV z=V6c!%9M=Lf%^+Y4Z+_Qw)#kzsZw9?De_*i`kBWMDY!D5I2r;35Q8#(A8$5fp6fZ=w{E}%mcl2Ljpnr!Cz?3IM^Xg z2n)FzOz?`MXBMGv1Z~Sxz`;kKnk^7PZ3oj4v4R>_Tz$-jAg8`*L##Z@g%r01Fq%o0 zLV03GvxdyTeB#A^p+=ux9$qo5KFovw%o9Qb5E#r`i*R8<^jA(7sINwtLDx8MB8<_i z046ggC}3nYM-TL1@*EGsTNSlKIt6{lM-Z;9`=v1~s4zYHaAB_~en`QS)yA%JfEWZc z5>1?Bg%A@F3}DbGmgbBx&m!B%Q{Xn$hp8HkD;(QO4Kds(VNq0rjfL!NXS%*DGJbs>q!mIu~J#<0~els-s;0d7P|z~XtuJNqYu}!WVBQASR!hp zNw_$=L&#YNA}i#Lex;0<`;&%xxq8_#R26Jax~fw;3lOJIYcFtY%0!eW3o+k_|>5qRoDwzmVn zf+0gk(YF=cnL6VzQI^b=fyfgFf*CW|>9hy4=g?U)(*OnMRltpG9IR|62++N7FW*7n zgaj{S=(zX2CZz@F@-_%HnJ($*Z5)&vyJ=!?MjImhsM3oxnaE;C$Tnd3FiJS}3s*9P zne%LE&(RI2QZfk>t`t*32v%9m(ZSVYxv`^bg6|wq`BX)q9(fgr8^mBpftdS~Micif zfqI^Yn+Ay=WyePFaNQO{yy62l1N1bAe4=(tKhsjv%s@6>7Ge_S;Ti$~3K%}vWI)Z> zUr1>n=LxTUMX+T?r6G(N7QoR-vDP(54_A*VA1=+b*KZryjH@C|*1Tq>Mcz`t@L`li zqKj-M3`)pc~J>v zXX;c4ERXNqh6kk^QPoF7fy`V1%oYvgvk~-NBUC8juID0BN;?a&du<+%%#s2V%G#FB zV1Dw^#EoD(q|MmccNE}l{gQd7F`!eU5WL308uN>$_J+!TVpEiiBdO)A}h?UNG7 zVNqcQ^OFxMpU4^VXhdzF!VNkycEN#lA%w?PVU1W0OMW;Y@SLD@2tx()m&uMOLdYw? 
zc<^ARW@R;3Uuc=g`9!8^`-(6E3}N1qBg+txk;RfQr*57>rlc$0X8P#Gz@0>WYy-Lq zl-Gp-BddAw_|kNj?2b$LdzZ5jMA=Rhx=x1brmPsE{G@xj#teBDZ-Wp-GzF8X_)W3k zGnA4R7Q||BGbtG-|zJm6HWSWugx=|3|U z(9NObf+%4SRbyM6=s~G7q)X#e&IQs(rnG(xeGZte&N0zB6Hv0}G_G&KnUH*e^o-E- z$OP&IrmM3+M7vZ*I+II6V%8U8B6#^x(<2jBxWII5u|vEynv=&}Go(AyRP7{G-8klnb!mAy1Vta2 zTt^wQ!ULuoKADC#}{4mcb-1H$o;8bB_2Sp7d{d0)GoM%1TDU(|bsl(yHq?UAPLW5)P z$-zfWk4#2{Zozcxj7LPflw6{c*SGwcge`X{eOuw10#Y7vX-XlKlJ_*rWk71AzLyS% zI+QKlrANV0mB&1t8)G^gvxcExxbT+jJH+x4@QEfejU5FXYDl5kttX1A6E0OZDQjI? zUL)WG<|p=M=r%*21EzxznW@T?KKK*`A;-J4ymA*YOEoi`gMRP#nVGz(RO=xn&nW~9QsiO>&P*0mMN-;{~q4O}P zC0&|WXCH8;eOpCXF}uzq9t%2m4Ot&OAn`n)o!(@|w>Z=KSVee1xcY>4q~YSGx~@>u z8*+iMEe-`9pV*rbG0b@mm=0oOup*Zq?t*q{LK{(1)N}B0)*}-xc7g3kj6NDxm=Og zDzxQ3;8f}n@PXqe_PFxvFkDJ$ZB@wWJbYf_rws@QHG-mDL}f1a`=YN z!C(X&2-7$P!)ivNd1nUm0Y`O#c8Zr1qY>bg*BYoyvl6}FF1fO}5hTdEDfLXJyOfQR z&bAC{i7?GhwSqJh$(5}@gk9$mk40J)bNG`+Qby~E@_;jhg2WG)pUxmIII0V5r$9=4 zNWmbZRtm!$aEASbWam5cQEq_?Pd%R&e@$vOO{%==i84$a;v)?@SY)y!E;!8x&_q4R zPz`;mylPn=*o12%O}*nF2u;RgK0cPQpvPi4+R;RwS3s#@DnQ7u;Z!NDTogU_=F{KL5vQFhb?Z9{Yevf1_7lO#6llw+!2l+t6eZdx_8h- zJ;+dv*Hn4c5@A6cw2`KsL%uAD3u3fjY(tzA6Mhh-?r1%01Q_YO6*y62%v3xNm^PjZ zY8r}WF-3%>8*tPo13MOU6QxV0;g_Mr8Qg0A4JIl=um{!l)wyr@Ory(w$L?*BA>B-3fjSrX~E2R(|(k*Ew z=@dxO42$8!mO7RQZ73q7fJ>WqDy0Xc3ooOzDjevB$B^PneVo=C3w@+X;ZP=9;({2h zF;;el;K5ItR2h&OZa_6X5b%NH$7*Xtag}p*aee7Hc$i5|^|40LB-h1~rOIDF-wueLF*W zfo(OyY96weLTwXkEEkm86k-ro3@YSK%ZlX3_kbF$Yy^fxj9vr- z<6L4vjU>c>*OAL8>Q;Hxa;f9)YT0iDeBk)88k4Xb(v^__+8Hb-MhvX-s)AP#Gzuv) zd;upI%LTXFr2tI~MKkzL`-)zhl}!@DiU|}kWh$Nrq$#7e863_9jtY);9B}+-(jyZR zFAybyf*?^RT&ixNT;R~)ET|1#?ac_jlm$_S#UxMU@TbZv946AM4mvD`YUfn-7&)PJ ziRf(^?Jy1z2c(`~33H@nz)Z;SIby=tXYn=&)zhb7!bb&17q&|%H9 z_^Lv;%Lr#O6mTRUFNc=mP-}~8?d$rV6_EH_OpQ|(XY`8P?czkbFJVHa(q?l*0Aq^znfHo=ZcZrDVa-#u0$J6IfiD8__y*k!GsRs6Fgn^iS~K<$heMyY z6<~@05x~-~FLd;I?-EMuHMkTw1lX;$YJlj)=5Rt(6v)CjFBQ;$id9FOXOVU?35qK8 zkTdGJ1ZIiSfQQ086;-;8G;RqLZ-d zXCtu?%DQX=o`oo|VTHA|c6<`nM+$7y^HyJGt>X0f2GxYa9~`U!Nz;_|=2@hj3OTB} zt0_4+<0GK~ybMur8r*s2rKdtD^I=>gJ2+Z|3#rUSqJXAbP&yTLta`Pyd2mSreY>gw zq7Md&jurA&yh34@fFT#wJT!W#d~9t6!ci42q-pb^j**NR2h|wF3GXIpj8@-B*Vir@ zY@!Mme#{m~aBT&naW`Ux3<8};{Nx8|n-F%2VfCQZ1;OR`g6hHq#r%LrAy$0GJd13D z!jGzO^#F(yaX7ARrHl;FB3xKA^;g?S*H)isVvdTBv!F>aFa8W<#6`pjDXZpPoDJq# zWE(vcXo!O(4%M`-z`Ii^cAC3}FyflUF4C+}pJyQ|LqHZqXCEsYcHwz8V$vuv>)`0- z(Hcz~6nqXuNMomfgAa3IYs;i$9%WpZ zz?vU;)Oqy2T3H%rNQ#2K<5RN*!hf+9iaB}`eNP#StEFTcF>u(nc^?1-c|4Yp!Rn@i zg+aMkhYK?uZ@ug}(pep#;JipBw(kyB=0Ze@cf)d#qMOGuNf~B~ssh9$UM9LPVrlw- z;-!Mx#({Nkg;22contyb)1?f)wAdNA89GJdEfA2FUsfVEc64hiE!-?1(-ICQd0~Y| z-;0Tw;)!N?3SXa=5UK4`xBqk|g;Bj_*>?;X4#sQyNFv{3YUv%-`);|RBA;0}vR3olTZ>OV^* zui(NCv<=p>K@a_L{s!`zIOk?Ma)%A|PQ99(Fdct#)m;lb9y2K!<>GsnD=CW2U4kZ$ zY7Js~)_k1b09IVhl-c+Swe((qh%Y`1w`abXyrCi}?hAP^JTwE3FkA&iflgolrafb2 zv0eIcejTGZ&}X6UQs)pazB%c;ij*7#DWuKPGg`vs^66FU_ruE)!)+Sv8cs6vOGyHo z6=v|%U0)9=FuQ#=#w&wzgU9b+XueC;Ym_d+0_&2=tzDbME))MC6?cJU$D2)~07l`* zy6bDJ7^cCHB%bTYjgbq=z&vV;CPh5wCgGLVvvm?mi_6Bta8i}>Yf$TNL$NTzt01<| zCg;q2F%4&A?9qcV%W=7Af9{s0Y}MB#^-}K}UJJ#(4#p{fq2l{c7F@n-Ud|vJ3a?z& z%(J1cE2Ef77n2h4oM$O`rxUP87qZ~vKq3rZ^4}Y+-K!I=x?kOgG!mvv}~EYt2|1LWx?~8&7P1L29U6>DF4J&V+-|{ zuNr$YT&X+lnw$NMU1%?xH5KlWtE8KgZ)Hq+Fo;IrU%2YE$=Qls{C&2&@7>FuJD&^f zRUrQ}S2ap&zDGAWU*M;yEHZpa^Rn*MTJNeQg)ZMRw>0h1zkV;@UA0D!AJs-h7;MIQ z?i8p@dxWWex5ldjMz{6OrCc1PuQ$LSMxqWO)}~i4XT5+K9JR&MQ)%H99uVvFr=wbS`(o5*5503WkK^nQCa&ySNHbluR$K%S_ zU%WIFJJ!r#EM|G}4ybu}ElqoraT+&AbAmrGgBdwv$-4&`ra1G(GLQ=jg%K|cj;IbQ z<4N5%GQ0TH?3D>V4`h7r@@0uviu)`b4TH6))9FFhH^Oiw3>i=cHah7Lo5!V4nb9Dq zHLvIb*;B3{&Kbm8tmU=Dbrn4Qyb&bRlnn}hk%;jZ?^T(Ge@+Gl 
z$<5(xQyf(obtXyrkct-&q;pkSopgD?jYz(e+k3|GLUIGz)kl{bSA^H9LbJjtL;NBKu(|k_mo~D8uP?eN&Cq5wgxvT`0PciWA7Q*SbT`r1AP$VFKfZ%z zcIeyUl%aN60PW0+(a>_Xw#x*P{D4;rzRki;Y0dYC*tXU7YS( z6;M~1(H-*C3Kc*6OFHF;(C@4zq$*H*!5la z6J1LGot&&w{Kl(J)UNwk>UUS|7uYZA*Dm{QC|~F<^=|9eUfeJK)WhVtzi!)Op%oi&( z2c%SIY*Ak4q7Mf`Ho*k$=b!xFMRRx#DJ!mZcQh2Y0y9nqs3BgXl6ztxDtzv#hc!Xw_@G%CR;D57va9OT28=ao51Bth~^z_qsfi znuhXXYf|Ef$Lu;EhmOyF?^e5d_RwP287gcXJmRrSItF;MUsD0Z!gd=%oDJ+sn7@*z|J-PlHHUyO1E({3u?c0ZAbR1?Q}GI}3C8GiBt!>?TC8zs84=(P zdMP^-k){+T~9ahTHaDxTM4tHzAzpLIL*b$MCphIBf9 zyKPbm0+B=eZt?nmO8U~^B_@Wu z4ps@+EZbu>5dfh>d`;aJT-v;0ctoXskIdJQY3MU41)Hrk{TPZWx{vY;O4-3)K>Hl= zc&w<5?)Ic#^lg!4!|-yYnANzdAZv-2lcLNmHzeI_D4M7~iZ4iTxeDAZYXa)=SW)d> z?(mwbpm;pA%W=8Hco;7;vslL1pim4lS9;cZf&c7PG080n;kj*Wj}=7HW#;W*kKPFaW~+^E-3eAoo^U3Ni$wvZI~{_|HQCF`o`5;(=<*h(}*de;gMtjec! za(P>xuvg&d-$fs<+FN>G*jdx>!S-0WaP>pm3o%jCh5zJL)o+|zuFDwUUZ1*~an}ij z8SY2P1p)4_#$&(P>>iKDimG>dgxRm-gWI!$9_TE@HkN~+8#G3?>Z_sqXrjQ~%#14@ zgVu^lFtk@p+QH)?FNbjSuAAkC7XQ7&ll2Hp_l7wuc7QTPtsW-xlbll zS2JQ7DXT$nR;Sc1N0{O-sR+c(TlexXp!8KH6ibA$`ieI2LS<$_hCVvfW(GI^xr+0} zl8mh@S|HQ1$Cs%}M6=GF_lQ`8Ol_A)w>x0-PJ-;Lw}D-c8G~GYwhuo{d!6g`jCBH! zhjux>PQ^sJYo9u%kBn7bms7nQZ@TY`V8yzZDuEFv8^H(J+8>Micku!B}ag3Ln!Ot(ty2m+b zehobily9aB?iPGkTeIe6g{h$1vn`VZfsP<|VV9b0&~(5X*A zbGzZ8vmAlhB-+&3HlEOBk+R&Xw)Hxf^rkAltg6v0tITBAn71=W_*<*I%? z96z}1fAFgLC-{TQ{wx20{V(Jv|A0s1-_(B@Tlarx@~7^7z|PhBUkf}8{7nsu*;>H_ zK3WPpX&d7`$xt*oIqBt6(i_?p|}8P#1VJh5J8qRaaq4 zueP_(1zv7=uRO4PuZfnyZ@Q}A9%)H6`lJiG`~|aXwsXawUG`tO>aP8Rt8!uG-2SQU zGU0ESFJ^cxef{UIy7%G6zk1nzA^j%!K3={#zL4wd;d}DSk|yTB?;4>X7U4Z#Ji<#_ z%6e%RvDz9HX2&&mUtq#KTia!#>=`RgxO*M5rNj!xOeD6jtNcxRQ&}$yWvj7p#DVup zGwn=HmL{`N45FC=osO~P6r2NKLB+vZ>Nof>YPLEYjKXQzWp1nWCB3PNpFN0(pGi~R zUMC$6xydv*gE^f&uPzDzZsWz9;M9Kkc0+Yh|Z`>(&(kTQZ4Yk-5objD@3c;n(!T5Vy?;W1-nSieL;aR zP1m(-EwG@(@KWx8$QJ_uW;nm*)A}2z!iUOu@UzQUI0_x;8t9&!^}S$>u&N^r)vp0$ zNh^g)|R1=vMu9>Y!{T@jSK zeq|1f2$m8NYrKf#$%U9>O|>nf))fj$l^1`E^Bj6MIbOQ2{g7Ux7_b%QzOeZ;ggmL` z%D0ZgRWiV>$25@h+QLqnSW_!}s4R0^c~|`2qz;@VhnTl023XZm#_{P;LK9oZ2cgP% zv~;vQ2-s{abNfBSA;jaH=^ZylD~O*-ON3fDE7lp9!Q#>ub0=MAEJf|9?J%f!DQmVZ zu-O-FS9Lw{hQ{l5>Ytj4~!t$>eK=Yy1?>?d=Y^% zo7Wb0X&J5F$wftRTS5F(8Y0xfS+CBB7%ZMk)C-P##p7JiTqSyHu>NNZG{YWdcLhfF zd&D52@wwex;d%TEKjkGA@HlZ=yjpsDm!$FJRPda%arkoeh(O8pfP?bp?X*_zs*I zH6oVbje5>gpFw$f)dsR@HBgUN0WCRDVdW9>lVz#7TQh`A_0I6Fa{p?Lx0cxQy}aA^n*|{CL>!5!!goKzZIYe6q&zh36>rIiWmKL?HmQb0l*krm1_e z$6`fN*9|k6BkSDq;XnXO`XxiS86jn$(E!I0VoWvx;x|$@S# zNZ*zkmxqrLR&|{6!7hlnQX*IMl*zwVqI@{6Yb-O1JqYCY30oNk!xBei5Aho*yjt-} zSAN_w%W{|3aGW_Z68U9!l?D`rV%Y^vzm}l?X5SZ=WD}$-?Kjf5g_8NBt4`ieFMBWX z+pc=P6}}%<=L_TJzV0<1s7y%`^L;6O%6#E|{T?`Cox?Ae&SDQYXsEn|RuxSRp!yto z)|xZB_e|eq5QIPH@1 zN}KXk3a?h-6{eX|7Z@>|sdOiqIg$XmD%oVW{0U!>>!TdvA)VU1cf3o-kx_x;Dp8w3T7Nxe42*A z%Wy`YNYP0EJ1Xk1aqZZ_@zO5h_0?hN=r8lDymPX0nJ$=U^Ve5F%%EV!k;PphJtBwh z#sV3rKodpg=y&8qR~Sw!jp2dfS)-$Om-UkSc@h*wz{g~FjQ5;bAa5H~EP{*|Cy#HP zz#L?YdzbKL617ZSkvpxl#LRS)@TA0nq~oShj4AIdtR)HnJx%}_WUaM{24+1&`&PQt z_xQL>W+L*s*La|;QzOQZ-GKb9ud!ve3JP7~VRBM6F6DB~%c!G~HI1D|e{NG3=3jc-iW z>ZA6u$repv?o)5tjPxS92wP`1liF1s#ybe@AG)f`n`cIrX1gbEODFPkq~}&qO>Z5e zbd12qBQFXJ={ep(Xo90DS4s|s+r}t^C~EnsFDew@>BSoEOMZ^^-lf>T{9NSoRIzF! zcyv@vTURMLVm_y=U49k5d&qy;N7aunyWYFhi*>#izDch?+-t)xL1^D!6~~`mb-x^U zF=;1v>MoEalLjx>^G3TOvLcLf$C+FkO8pFkwm#TswQ0T?;#sh@I+V&k39U-Q7<}P! z8RBP7@@S$@&Ki7^nvM2EpvSqS0W&tG7$H9TCOU-FILwby4*8dF^f3XR#}m!FkC;v% zVOSAS{)_e3sn13zG1gm-#z4CzAq+Z66z}lT_)=WrN()=X;|JXJsDX? 
z3sf_yn5G2$UqNW1Ge)#Rd6`AKAharc_jvb>p0FUX;#{O*4xSBao!5A~5S{h(+2*U% zGNCv4(NV1rHd>v9;EQrL77J&2+W^VU>(vA#IP<5r%VZtp#XK1p`#UFEh#);aq+VDB zeF3OWj}5C|J) z^IX-&(durHD_p*N8K$9Gg*4~lBSGdM+uWrS886luD}~)l>GQ0s=?(tMRoVGos&~-c z)$h=v63pmzw?vgQl8g0lj#BP`oJ*v$CMxFFT~c#~oh3>D>&>8J{oJz|Jk zc@!~$af_LYzQE-_yEG)?9qTpCSE-1Kkr{ednr1z-0r1(Jdjy>3>@$U(bbR=Q z{Q#Elq{qKjn5t|~;{-=$5Tc@43{=K~w@;u5vw%jcUEg`@m$!|Uub~9Q1V&;x(=87t!YV_zXWBI!6KF0)F zVm(*>qyUqY<05xIF;-pc|Hf6h_UxQxb2o1Ar!M>Rs}gE(&L(D+e#g$jW4}<&>nDFz zB7b~YR@iOyd0ELXEcPdZZ}G~bQ&Vp1xff5UP+ojdkh{q*AVz#4dnoQ80+47QnuO0K z$BxMFQZLMe-M?wcG%~Q630~@ls}8`=Ppd{`16mz za{pCyCI5wl|KqBDZ}8`r&7E|p z5nlpd2(3rHovnu&q^WXu1NOMJ@4l>hjNie_#0+w9iz08L?_u3Cf-L%QcA}wI5Ub6` z(7IOWTN~N*dO!I@LSOB>FN=N&O5S4}hN)Y_vX1I|T)&JAr~J$YCaxj*GQx3=WNtmw zUL=bk<_2r8Q5he6A(E%rhtS6aMq;s0k+;*CBrZT5&jv{6Ih`!n%K^iAzHqLm&#RfYW|T19kgoUFS3TdtwWUU@b$b2^yQHod-NM5Y*4IO$)xP_V<+J2FXun5& zKfS8UK0e1XE03fm*{7==L)9z-Ez@0hX3t>py(;d&2y^gGrjYINg(wI;(`Xq-U#I%Y zib7?NsE?CVW2Q24SvFtL2faDqMUg`3MSWg^ri~ff=GI1EaQkEJ`l~g8#$YXx+D{?(fStw(P{K^B%x>$xLFRpaS(e$G{THs}-z{t_tf!NKyhc1~7503g; zxgDbqsg8~ilgF6!4TiVeM%~21&%!VH#?IstW)trh=SOO24;44WLf>g$xTqicu8(zg zc`z2y*DXI>HCPF#qtqo{2dn$va#dqojQys24c>(tdC<&vum8qXyZlctJ70`nPrfGp z5$u<+UsAul{$2btLt0{C2LJS5l~iND4&S9=y#M`vde!IO&gTD?t8Um2E}K8ZhcAFX z(SAYwv%URQ_uJ*l&$W@&(kg%bnN&S}Mk7-8DmAo6jHFHZglx6kj|-Ua$;DU#b&}a2 zObg>0(>iOzgBjK9z6~h`EgI%|e19g@vxj*}Fl$4wsMFUzFS;bD$}-ij+(6@mtL=c1 z{GKbC6=WPSYmou)cU?7Mp5_=yh+hIZ&)72Qpk3y0orJy%V$<4)AG#$OyO^8 z2aKW#Wacw@#Clvp%+Y$R^JBIMaO_TwK}WiUPOJLF@3z(vW1O==)=+QUj5UGO0i$RF zRVm(i7B5{46^TS z_P{ZE{8${O$8`&LdLRj((9T+yW4&Y>$G!NyHr^J}zkAht0atcczmE6sB%Ahnxlt(p zk*oT5KHv7QU3R`E|AN1M)&7Ccm&ZQ?y_oF358S(uu#_3dI0&4~E~uNSM^@?ZsG&dJ zmfz%%w0sScQ&%tzIYLe0(3I?(^Oz}LAv6$=89sUIdSLTja-#Y~>ORjJ^33=qN#}UU z>h)Zxb?*DJ0!Y?y-`{`Leoa5PY%a|9zj{>!{PEBCqz99A;wurAIOX}06t#mBqsMQV z>S=aRrV~U@g5@`9Y?2=(0ZU+Wz2dCCqS3xMD>MKQ)0r3(}(3mAI_bhEnTR~jz^MX^S^b~ z-HC5=_hjAPpKs~Q%OV)i-_#Bm(enNH(x8V^aAl>?GzRS#f0|kft9Z$Y>I>;(N^(h- z?+c1P9Bv>#3;vg`n(zLHm(9KVck%A?HRMaD;E$W{rT%_ne4&m*kuJ+G+`})hN?p#D zYV7Wu*nc54bdy;r?D)pGBMdAbPA9uim;HsS62o^w&~Lmdkp}+yW!<&P6Enwg<+od2~xGnnAK8q z?!twhoNuMOF)O^9W+^rZw5UuxS8ALG7Tx%D1%|gmX_@*wLY-f&qttqDHH6n#6t5ZA zAksBu^Jk%AY_YTp4L1u=C$(f1b9|#kQC;>Ot}z^WeAoEY$hwXd9s{0X)Q2AQ#_pzxh>bV~|nT^V?{mlHIKr#|}OCiyEBej6|a zanT^m!);5G^#MI12%cZ?+U!uyc-e(U0<;hvdy!J zUI)V`!dBl&4egPOMf)C6cb9x$J^OhE`ps9J(C#%`?uGc)_+_y81MgLXeqi1ryz7IR zr6B?w->?ba72uW2M$CD9CnGazWZl{-JXAs&eIs2{WHG`80TKMBc6gP#7mYSdg-srL zym(#QBrG#QaCvZ<=j6Wc00pp)Q>tptx6(C5jT|l2zAPQ-mT`?stOXk2&3;6#C)^`e z$jw7zZM!xZVi#XspqYo@B@UA^ozv`~q6;=J37vkzUtFgK8eZdh9p^2^W>d4q*>(er zURJJ34Hqb&XIZ!v(h<>w^HB$7KC#xNE+QBUTf?I9M~ieAIOVSpF36LS}NH zpQmj4%p@R;w^evE$@nmxf0V8%@G9!}Wn8U>jtJFEYLqfieLis5b3L>`+>LRXr;Z=@ zHBW-6%3ZhDt{;MiR&&T!=fzJ}KDtIN(#pFOw`q?x8kF#n$fyywU8UhpH} zy*_RQ|5K~40KT_R6uJLLVA!84z5-x>qR8+gQ1(aQ<6p!-)bPKu`V@ft88Gut0p_0p z^UeR?dgd>}lNWzczj=DT9v{Bf{PHbB01e!;19<9uQ3Z!C=78}Ku;*jF3}>hi`YN$Z zMgR~fTQQODNFA&tz@vullXqH)suzPNw}gZ=Vn~cbq3~x8^LWchmaKS%(HcFBqlyd0 z_~0%xdpv_yrijHvr}W3!fZ^8>U%MoY!@=}OqU3}Chg1l|jC=vEiCIZt1`m5a*1p4z z3ZYerWikT5nX(lV>5kOFS^_-kd4?y|(@vslp~#>oBqs*yJOjn<$Q8;$qJG}_L<^Yt zG8il=RhCP*)=BOl!H6O>CJxG%#>i){#2Ziqt}LSH39)-5M43_L?I&e0`bnwjBy0}egR%PQmLG(pF@QE-Uh zOlspZWCQ>XW1cXXrH0XZ8ApW>qGHf6qqT`7kSStVtIMl8hI#NCP0qSQMdXko^jbGt zIHZ{B7@cy3H8G1E%pfKcp0#ff^&v%HVwus}M8X-0baJB03y_&Q-$N$D)0ad>8LF0E zYXfv$QcU&Dh!CqaN#u+$f|yK5*1kbjMG4Wh)X8K7AcAQsCej_LgS7;B)brp6)zgaEnID5VQ`>sc zy_su3mG4w)R&0Di<}4}8tb2xW&BIXqe4h98!+@YBhDSPVj|rbb>&8v z%x`iuYr{GT*{F?x5h!-%5N>`=fMx>Ik9A1SL0}wJT!Jz5*I`*16fy*}frN}_{=G=iWG$WrHvYE29alNr2Zw8n#J^cDIl5kP!d8F+?b 
zTus8%^Ed;UTROm$$Qny$j~uXsRFGvl4h3Z8yn`c5lq@>6!)U#X=qvP9VwsHe#37#R zWt^i+zBF~=k0H67zOcirLy7_TT|z3zAr*=pBjpMybyknTdMyd@VJIvqdY}Zmg?l`M zR;FOh6J1`Icj#+cp1hh2pV7heh*C0W;GP}8Q|F6nO%fRy;~`+r$J%$;Q6aP{u}nq) zI8(M_BHfWXSWAFMJrDas_4FlCwZq7u6RoD*9{R zE~l~hvS3nYC%xl=NtlPA1~&XU&MTe#X}YOu9M)QzhD$0GyCdIN{Tqd>Uf64y6(At3 zDlQn8H81gHMd}%f!kUDMqBJwp))7(N?^p7>VhIU><$~%+hI8HWOrVXELT z)MF5$TEv+z)C;XM}Y04-=3WAPxql7~$6gx(2 zp+oQ~gCSk7B_W zkPK4=hoK&W5Y-~id>Ka-m*#o!lsxb;qEjo$dKpKBP;Q9;D0|{U=^09}B~`vOb>RCKAWlZuQN<;gtS|xf1Zf`n{4x;EaR?5@;FXR(bqw_o zEBz3LrXeu~bpX$iFRI|s#T+nJ1-JDwqOZ_bmeW`_w8|pk?Ed-&4ioPr(C6WTqsi$D zJ6IiBORu#78ZIfO>mFqs*2FAwFoKv&NY=hVv)+CV{Bs!TI&0)4ntqS4%h{-~m5vNGjmje&FBX#9U zooO+6Qa$Y?supT05<@nMP%zFwW;vo43=Ts*%tWh|gjf$4M@1>OL;&$+W#Ac#^g*l3 z3v*^z`4Gisp1ve9$}l#D&9O9HmlRVSGa8zovZmu9hg>@9e5{voR9PYzLw_Ak`3zby znO{>vqO(8F1-XFqm@1=(U(EHAC^;dGK;1!jQ8EI61w+h%YcR_P@nIYlLWn}>^>75R zgH}v}5u+}z>KNw152~jxiK-pOhBF~KbzM?Sb<7B=l`8yse6wilken~$sIo*bhW4iu6G-$$@uijC_V$`ne7Ky@?nqez9SYidyCwgtKzq!I6kLs8=#T@GJx8 zC8aWik<4gqBH;{0x-oHRADX)EPQ!CGU z846{f)|dq>P(}L;T0sf6Bsy{=IHYNo^q4B6hwsw}l^N(E98yfzJ<2$&$wm#+57sge z)s*U}DCL$2Aik^&JVTK_XmxpE&I~Ibvxq_DOCsB37#qWupoefsG1W0T2v%#dQRc=g zjMhWuc~q41;|htB$p}D1ML6)p0?SE%oMFcg>i@rPzJq*geyjN=`_1|l`AwgJ-yS2r z{(RlC_8aD%57s618_#!tRB;K$&|imGoIxv7u;!U2hdoq>zJ~ALHK(0?wmDeo3E8NP z7-g1O_b6EeASPyo$qWGqA8X%XM}^R;#4;HH;7r+yiF8NmU@ZY2^*qCq>S-rYwNPZx z6B0HCR-S=kcjO9XWwSC9T^A8!5bHTdKGB~zHvkFC$f;q^;jbRRa^#kl8m&)W>Ez>Z zFe3n$U+TklF|M+1I2Gf>=Y$5 zdJm_rB(?F0ON?g+;jEnZnvhf`v((0>CYuT)Rg_k&1uDZ4zz$k52^JHbkXJ{ahYLaV z^o1QZ9a4mzkenJWDW>ZlWgOPTEOIc@53L`>BBp2=lEFgiR7H!@-<+uZ?Ts$^($oRt z7kWIX`NXB{U=4z35hdQ7C(r7oP)rK9>$cu5P`~Y z1h9ivOoGMa$g88zvsdB`s7ymW(uaNs36Y$j4#*;vVJb_V#YkAMC80Ttql!x~hW|pSudfJ65zgX!B*{C5H!8v6eGorV`i^s4qL!{0Jv4|jDrCq#+J~OhoCz%{rt7|P-fLnO-kBj7*pD^H zJeidgeTk68mz9BMDAH>Zrk=+cXy!h7r6Uv@4xTBH1D22qIHW?cV+0{W-tdB9;3ry1 z@hs!0;u4IZzYfu#K`T?RHilv*={Gyr@LeaTU8wSl4Hc_|Y}7^!r;u66nHe8r^$+Ht zUSY&n4^vi^B`}Va;SS@O$ef`VXH0X=i4IL3{B-23JGQ8WVw1H>&_lSS4p>;7FNLyj z79+vW5J33EvkYe?MPFi>$jw9*PxaGDca)6wp{Wa>OF;+lgs9qKWX3E3R8E+%9k7Zc zUkb%Q#loUPkxxR)i2m9)3!zix+A1^fjF)l7T=MGX1u_qQ=5pE%lvNG}O=<}V0hXyQ z6p)qk4vw>W2aKZ~M(brnU!kuohX7TyHj!|4f3YRG#5B>N%ro55s|<0P`6-5~O38^! 
zjAsYoM9GL%z;PCHP%jX5H^Z4}sG%c9uZJT&J17Jt7%}SdS{frC{B-0*1$%ySw8tY# z2MK8eik9K5oVQLg#2nNs8Sy=g5*gy5@*OJWZn#CE{7U=iDgD>qqG%9(v49UIR@S{XS1XO#2qSXy4pZL zgjB#GEfh|Ma4>|0P|N}2XaQ3%vy9I6CEkYcdXy959ZSgw(#njtZfbA?Rc>0@y(-CjXD!vQdoVHoE2ezx72` zbz|24OiElvk$s)v5fI_G$hZ43f$hY1LMl_37lCVZG&X^au0|m0m z9r`GrdGWCaO=|Q!4(}*h%g+vo+Gc&6)&GBeU2f^?_;+0Q-1Xme-TtomqyHEDG5gi> zFZ^F~-GA2~bN?m(-?z)3`24TcypsNP-~ajI&*J>}PXW;1Al3!u1R~x4M5r5Xx+B+r zvZUvsOv3@Qfqn^{Cc^K|mRZnI`XT1@>8fW+KWQc5dl^@iOFck;1<_w%PNraO42OeM zZ+5-IH>}kvs{H94DprMT)J7c6w9G2!V0?_#zn;K&#mH9=Q&!aw7-wa;V4g*`MlsHq z$u%b(njZXgWF2>GQQOL>THp^SXcU{L~T@E>4K zAoXg?kt;#)EbKmQRhp{a9yF;#&|@PA8Xyk2$k{#0C`zNemXs%0>{JLnA>M=_)`>)` z{_N7nL?`4Cbf%D-qeGnqnT9M=BTp$%$V41uTG?GWHaew&BrslOK?PC=b=o)0keUA37sZ_paJ3_z(pau3^+22=G>=7<2))n zm{E~BCjg>hhYNH9kuIbT))L@xJ`a9SUEL(A7EET0Aq5ml5A|8bF&(w1`gDtuMTM%5 z=Ve?~L(~KGSMVGkU`|a@N~D9FAA?vsTY6D%w(d~nhdQ3rArx}fApwtLkaF{Ig2&@N zM9zKk9EO-yTGs>gcdno^U|Oc^l~BsmbF-ZwKXV(#F(4y4MK$sg1qz`ZX5{B~RPBlV#lW^hhaLkBvY*06cYW zoOF_5Pr$Lj`k=Xt^doc|A`RVIC(=Ch{hnU!m?+Xw_Yf5w!_KD(WaT0HCFrpcs0YYB zWe#g%)-s7MQyphKK(C79`LQQUhEOcQ2|hjj^cY=R1f8c@SnIfFsK?qg;>4Ff5ikb8 zmWfby@+iX|Ou1)SQl9uvQ&KMVRFEP#tZb%**ZI{W5|y<&G}Ampqghc!Y@@w`Z^pjQ+;>)-yr_~srNf1PTfjOCiHP0zptf30J9ll|$R`LnCy`dMf zQ5$iTOiThib#9#2#H^S@hUkadjw$L#=tk>&{Kaiq0n>s~ueP~zht`>{ zi9mCjh!;S+D>vUkkk~`YN0dqI7!yP6A(1q!n8;Pr$x{wId$r|MN86r$Q1x}g?i^f1 zizr}e8ZK#C6TfoyHKBz$fFI6%#Oc5&H&Xf`(y+g*0M{te$0Vkn*O}I25rfE$P;3U| zfSl$KIzf&!t?Yo5E99tiQ5xm7q&&fBYI*it>H+#IIIMtanSyFeu6%UV(`=j>%F&J( z8$B2ms=sHEPzdKpnd%s%T;Z6Q1s@}bYH4nDALJE*G3aeo%0C1t8_5Fwt{q{std1hGs=p55uJiX%kFQYRCLh8;{hfk+op2WtuN zIG+bUsIG1jRSPCF#*hLErHA?~Kc$)Z~8dya8c4G|B}UqLJ`FsG&{CDK98 zk3r+?cKAGP>z?6&$r@`2g`kl+!>oHC92)CvvH);enw{Nar$Xqd;Sj(CVm(`uP9XJ` zLL1TQ0@?inbsl+YZU3m zs7p>0Or857AE?z$B2Zv9jit2#7^rhS%Q&W^_OvEN4lA#*XB3_oSJe=cr%u&@OQwUQ z*3Pmfa^w!JvwXuQvNKudbC#)*U+Z%;gm$C_;soZfCe4v-ys*~@A3CV7iX%i@>YM&JU`-k0qua8r+ir($=-*cAadGh(DtWLyhgvRK}UiUyaG+-gbLj$nRFy_3B zt3n7-u{xP)l_jZViao5=wP8T?_Vhatw$~jhB0&z(FM)3DSqFfp&Q0N%ka$34`9bs} zP6tMM80YAQh^`DF8Vk$`q+TXhK04~2y%IZ3Wg6;H9sLptnTXMiQ# z2gjNd&Wz#FjTT)QHe&@$3rf9Au6%UVJw)lqiV7KWm#L#)LLn0|x^aAwBpaJ5TL>G}P81@9l zD+Zk=ka1NAA<_f%H;8qCIe|zY6LXP|j=IMz>3JyAaKLP!UqYveAZUO%2yjuzF2f$o zqQUwg=3$&i#ZfLDni7O!1xyP{y-ajMK04|il5}K6g$%jN)X^`Ykck-GIKIenSQE2i zu2&h1L5Sqd9if{oCxZ#Zy1<-3>Me;1t^`4f$&#LjGHP9421zMU$V3n{fOh5}$6-x2 z3V=^qNuKqfvnq}d8^8l9Al3!u1X6EFRB$B-Qsf6!UpI-W1(O+L$VRC#da~CYfsW+K zLQ)g}AA*MDS;h*7L^%U_K$RAyzpZ)ITasR<7@E3DFY3)`G6p{_Q=i5t1@wS3q)8q3 zz#MdfH&Ghpg zldbQIoMwH5>IKFNdyVi}hL~wulQ}FITI$&}tx@U`L%_(F2IPaEj;!MjS^o6&NGX}F zoFPpMzjSUiV?nnk0DK5#%=uxA3L!-E=*keHvA~=_>S3*}ZFGP=_(64blc-uS8|a0! 
z>bj(9VHo7Lu{qmgNnDALD-jH$1)0dR*t z%7~{pKhrx*!?JeOa7iRa;gGV^nwUinM(2aIj5*T*9ZV8F@C0etUsix?6zO9UQ_t&6 zD?c3(jr+t}(Ss&+2!+s=sV)kgz)tmXRu6N%4$V)V!x+6Jd`>+qJ%se6lcyYb(1psF zI@@tJ z?^h4na^w!JGg_avD)|n(JtM9)W*IJN0qw4QV{vg1ca;X=V#k;mVyD|~(y(GWb}|7C z@by;@^BaOXnz~BTpwndu0b3^A=xL#IVo zhLD*9ObbdqV$`)Q4af&S9a&MqK0hr}mv;mhQzSCNdyW^X4n!6p&bbwF=Pq3DLhNg_F_R)OS7{( z7V5D=%{Ihg0@1K@j&lN$E~F0D65w$@4}MTx-6X0OOlFKB8>I&B188^U3hl!v%G|u@ z@R<=}hz)57!;0zH$pkcD0)O=|zagljsjK8t&}q1y3f6!e96;sLKv1;S8u!2))`XUP z%;t(R=1ew>tKw*B2qKvW8diX76zD!4l0y!i-YI$7XmU2kWojZ)3h1FG(zFPBkz=za zv@i$Y$&J%8L#kG&u2u4e1oqqZ`wZWJzj)NFNiO?m&U8>$i~xBUDW_HjNd% z?2UloFi=M4X1**K_F(!sgfi$5GK}<*(hrda%305rgf&XNB~ig0T4zhcSW-qj#rX+E z=I;{pupKE=D0EVGS`%8>X_OawKH}~)!G_cc5nUNVu>z(Ar5<)UOs!5wMt~>P)k?l< zTge(LIx7b1TnEtZ$~QLWqLA@Pi^Yy`_B@Ay8m=KsBQvdqz?^7H9}}JArlan`Pd8Rn z$dJ2C9sLsYfHNd?;~1oz=9-uVKMVnSb)4y(Cu~URFA;`Ntfa0{>Me;%&+Cl8xkIOM zjuG1^lNq*zLLeJ0?2H(g!={iO-bBgDE5@9caaFnOK$9Sd1@t7HOgVTl91c=_ou}D2 zGnAuAZJ8Q*0RqLHbx6~~Fv!ikvm~+S9OFfY51R8JX2kPh=jcX@t_&d>3(N_mUM5#Q zI_e&xpt`!5qgpT<=!LXuxTI-e804n#I@@DPKLia44V7xKA*COp0w<#jJxM21upSdR z)z^8Ng|%WziiUbrJHUh#P-x7_Htj`DQztx}L+N^zL5I*`kf92pCtB(PYSY>Y7ky}yv*J^;2tZVK(gu!pp0wKT(*kv>xT zAr6y{nI@^_WQx7oa;mTM>Cp6PtHKUOh@nF0 zHbfJE#uJBQQ>~YDA$7@@2IS*yvBB3(A`XGsK)-}ev?EOmzaTe-W3o|);3q9jHD+^4 zA1S>dPF5d|GNp3@kxq^W7Q7`0Qc&`=RY_F6J!n#gpobc`^B0?D=EQl{QnR~qn)Nr* zg!MwN4xeR+$Ie5C^^Plo}nIV zW0)ZuWm~2&vuFpzEal5WVh?6SeI*q-jH}9}9-zO1=r1rQQ?NFM!$GPyyWZjRw5>Z- z`O`a8tO{w>a7oiLOPz8~I^kbWV7$u6_cF>IpBX)T69xRlOl=kL(q`&RIJ-DP{SP}WW=-D!Bk7V z>Md#H5d>l97k!iwfz0wi(6yc@+mWV~9S~C_ly#2vkgnH~5OXIPSCvaWKz{|%U*Mcf zXxAgsLC?>5uJbetYef~YZF;CiDM62kF=&7|8qh@~Ya`kFsDhC%sXUL0qg)3IB=9Hv$W*h3UlS2y{p1(QKXxinpuG%XB++%`6eJq-NdGbA)rs>Ozseuy-{ zXT?PcYm|CRqPhe@{N)aPlo3yHetagP0FXHq<9e0@W`~Yi2aigMk_8d0kLPn#eO0;C z1N2w$93Nm#Am>+)NL1F>d76g^)zwX+YQbbkQ7)}eO@jgA1j2@|!WJi4RGj;GrUNQA zr1V3iVSiZxuFQ0v1{e~Rp4S=w(;TV-D~YPN2Tf{pR*=)4b$~d5K+A$|Phh;zt2+x$ zGOh|^^@gB>A;h}CoIvU=iAvAwYydo|u2vFN+e&7bAscn@CGGvmDu+#>taIdn&kR&* zC3#-PRW(FCKz{|#@d4&!ioRHLU?C?RbGQ?z{z5{uH3B932tahoN2o zSXHDvhoKM^Lbry4gbIii0>?RlIERqB;SNn*e2GKvoZ)zE#I>F%+cGauJc4dQ<7E&B z62ogro|}Dh&(6_ph>a~oVyJ%WaC;vvKlSwVpgDw2Xy68)mRaS{ z;Zz?69x@}Ul_b85t3s%Lh%|I-#aUA=&mwKC)g@mVkk8>sbwyQ98>Okz&oP2g8v$bg zMtS54Wx=oq(+@s!=@2rE^pVmJkp{|H&z6KW%Jw2r!5v!X%{na}l@U)dAcxZ!<3JxlhVLWQ z7(=2JI;4hK_rM(1gkVR)EHCW&h`ZAPlfll(dw75e#Ja$oKBfvl4y zJ&&czXATYyu-5;8>+XfUAGh=l?sFwC#!KqGv`ug52aC|fIs2yX{oy@w-_u!mNRm%3 zm0FsemjMuiozoD9nbsl+YZU3mwG#iecIMNLp@ezj3FDf5im;4&Y1`|e@r%t zS^1DBW6pShUR5sg1Zmh`R)A|>hla(FsI0H^G!NNm)}!XkPs`MJ&>TW1$TEeajLwZ_ zeT3=-#;XiEgbXA72;B@pCo`=@64ofj88h;SKn9)j5Oo@@Zl~2Kw+BsXbXE{`t^>qb zIeV;vhY5y58FO9+Mye2c8lnk7tZ~F zq-mL@4l186z$oO;7g9I!p#k|E?$FV9hHBjO4iWk# zbb>5XU6j$enU51Z9(TxdMn30dD3qbKnpiQ@DoetOufL`+5)N+SX4j*Z>S`5L{`44+ z->h9VBql>!+6qCnS)ZJmCZh8sMNK@dFEhoPQ?h-xL_s|Sp$;wX2BH0&=cz%`2W(;Z!#=7Fr1L&xwO z%W0#`#;_$60vXe@4xpVmbQIc$QPg4jNh_()VO&)%^>T-yB`YpUI+=nsk0h2Quut!( zc-rvb%umbI(Z~8IJwcXv0c?fbIMs)4PXIXWVa$0M=||{h2s)W*Et0TCG0vD5c{(_B z&clU{rq%7VoMS+$=wodFh&tD^j3ZVfS15~aQL^-d=;uH@F~o+H-VoeX%`6m2I+N&bfO(;TH_lC2SZp0)e8WprP+BISA`IwV!AQYS|q7u ziaqSqwT%w2$7!UG5;nC4%6T_^fA3 z!WyOClBnPgZ8x|>A7#W-oS*6Gky5f^+mWUz(Ar5@HfOs#Ifa&v5$WXN?rWwOR16}`U`@d9XP94dwEGUkenqK}MxFXO6mnY@h@ zJQys0TT%OaGKe`pE_e?u$nr4nP@N$oqVj+|rGOr`BTZ}EQ|7QHw6N1CFZ3FL(}5v2 zBz)}2k|7jJkiHyv(8mOasjoAbT&YF6N^O~XsnJGYlb9^v7^wOnrc!-X z4N(u!U%_*HfH{E}Hzsn-Nk?sK25?gpaC#Azcj_GJ$8pA6epzh9s>X9>CW?>~9#R3u z&+9Cn9m%1R#ek^+qaR4^Ia<-HPuEivRx@1fnK_*0z9rVzJ7}{LJa@vinPh;V{<3HX(lo{Jgbm=0rZ zlQN`NaiZpgEtY~~KHccKB4&{lb~SE}!Ed|t49ge}KG$=gX7_r|$qk)dwzmfI86QgT 
[GIT binary patch literal data (base85-encoded) omitted: the blob is the moved hadamards.safetensors payload and is not human-readable]
z;qCw&1y5n8z%wF#L>$LEssi_T7&`@?AdXWTRe|qp7&`_21BlVtV0(lOW8;KJpZM!L z{)`Le%|v&R0@--OC#!qw?&0%Vk<3B{9%Y4LQ{WsT7Avqp@kC74L1M__zX4*D9#%X~ zOynRjCh!x*6T}$8=qNz@8^sgEIN@Q%>~3hr0*w*l%O6$Dt1_sVS89+LH*RzknAcAf zPY`2Zql%AzStrWCI=K%`5R#(ph9lgb}ZY@ z2#w{MT(o*~P$I`05A9Aa(X97ZkyBJEapRJ8HI`Idm8W9KZxhC=(a`bpH^JsQLIY{_`i@*7vAWdmr#_%TuU8%z z2x?8TIslpZ8!P;S13quB$q4~0fOyQYf*jmC-&%5Yhxb7_`ju0GEYA&IV`G^j2Pv74 zDSjeC}7iX5Z5ASVX` z)Fm70SctobJSrz-m>k>}V}U$Z1i)3V$sv(Cfg>TTmOGC*5}n(P9MnuhVP?T%;TN{? z2*o8~z{AJiMCaDzLVyS`X2D|N7q<80Ok}QVB{>ah50SGX3k94!`FxDU#;J zDgZtuUnsvOM>hYZ4U-R6(3xe>6#v1%&tGAYOJrKfz=p}~cB6BO3J96Be;K)ios_fj zvvQ)(2g;5vluwbPWsin>yn~Ftl$=L~*KsT%4*D2{B#j5MuaqbHr2LvZym%@(J4Yq` zBAj)J{zML@E9I8*19HuYJ+uc(C4RBT*l-lq5|;n)g6~TC3UXArAg5SNTtQT)Ytuw^ zR<2FL2>~T@$H~F3H0E+kO1ME5k03YX6w1;-Lp7CiaVBR$l71kEs!Qcr9BMWtSn^+( zwx+s;TFBvxywZ;hqFD;nasepO$Y3HJhBK%LGKkb`njC2;1sZAL7FFT+L=IlEeId6F zPC|5WN##j76gvae0n8V(l!->=k3Cb6mYmySkQ{hCrnn~;sY*c^A7oQT+E(<>$fftB z+?|}pa;aP`fJ7a5tk>lBs#)zoi)8*w3oiSCI*=28o_Wqcq0MRh<>XPD9Tnx-5tW4| zG`07see~Q$E|fC~{v)}^RmeF`Q|{zIM}GX`Us@P!enU>)GdY#8C}LJh9lMue8Bcgo zF3~6QK;cp1V)9_UB4-0lNtEZD4L>ONxU2GOaxyXBV|qcZ$wK;ZEK}sD%Y3~k1D+{z zG#LqeVAUCwjgxy4N=^;rxFSb2YV-@+hC+$y*yfYUOrBD~6;F5_*A8jnf&Z2q$>UE} z8%|((2%|y>(jqMWNg@}Jiigk%{B**91y-Ixep{d%IkY_G;WNMoGWAtVGlBv?l`xH;PM8#g>pRgw%w`ThoiJG-;4_MVp2ANj{9y%nw&N{F zyHg#cYDt!bB3nys_;VQA>Wk#|BnoD-`Q~W zCkmjasRC^FcLj2g!v&yyrUOi$Gh)|#@p!<4NFH4Ju%9ac?wJDMpXor_$2$=B%!pd!2JU61L0zAQq0x_T2L!RqkvH(UiQvmco?O?J1^-pxbuYaNg9__@4biKw4=;ItO z0OZ_=W;>WHz>kNDhA-Mo2YJ7S3jp}r4kioWv@n_B4gekR0MMBNnEgxvD4!`H><>Gb zERf9(kBHyq#E5vZ6COWC4Ah;~nJJIXt5Lc!moE|5OLT&-?YG j0uVD(0EheA4kioWbDQY^;%7Pl_e=qDPZS{cc!B=`zSILN diff --git a/auto_round/experimental/transform/utils/matrix.py b/auto_round/experimental/transform/utils/matrix.py index 46d684e80..184d91055 100644 --- a/auto_round/experimental/transform/utils/matrix.py +++ b/auto_round/experimental/transform/utils/matrix.py @@ -1,98 +1,17 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -import torch +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.utils.matrix`. +""" -__all__ = ["apply_transform_weight"] - -# note that apply_transform_weight reuses some code from -# https://github.com/vllm-project/compressed-tensors/blob/main/src/compressed_tensors/transform/utils/matrix.py - - -def apply_transform_weight( - transform_weight: torch.Tensor, - value: torch.Tensor, - location: str, - module_type: type[torch.nn.Module], -) -> torch.Tensor: - """ - Using the transform location, apply the transform_weight to the - given value wrt linear weights. For more info on input and output transforms, - see `TransformLocation` - - The following explains how weights should be applied to values according to location - - let x be input activation - W be weight, - yh, xh, Wh be transformed output, input, weight - - note that - y = (x W.T) // torch.nn.Linear - - Choose values for yh, xh, and Wh which incorporate matrix transforms - - let V, Vi be transform matrices on input side - U, Ui be transform matrices on output side - - pick xh = (x V) - Wh = (U.T W Vi.T) - yh = (y U) +from auto_round.algorithms.transforms.rotation.utils.matrix import ( # noqa: F401 + apply_transform_weight, + multihead_matmul, +) - The following shows that `yh = (xh) (Wh).T` for the chosen values of yh, xh, and Wh +# Old private name kept for backward compatibility. 
+_multihead_matmul = multihead_matmul - (xh) (Wh).T = (x V) (U.T W Vi.T).T - = (x V) (Vi W.T U) // transpose matrix product identity - = (x W.T) U - = y U - = yh - - :param transform_weight: transform weight to apply - :param value: value to apply transform_weight to - :param location: determines how weight should be applied - :param model_type: result of type(module), passed in to determine application of - weight transform - :return: value after transform_weight has been applied - """ - - if location == "input": - return _multihead_matmul(value, transform_weight) - - if module_type == torch.nn.Linear: - return _multihead_matmul(value, transform_weight.T) - - -def _multihead_matmul(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: - """ - Performs A @ B for last two dims of two matrices A and B that possibly - have different shapes, as is the case in multi-headed dimension. If - shapes are different, this is equivalent to converting the last two dims - of the smaller matrix into a block-diagonal matrix with the same shape as - the last two dims of the larger matrix. - - E.g. if A is half the size of B, this function will perform - [[A ] @ B - [ A]] - - If B is a third of the size of A, this function will perform - A @ [[B ] - [ B ] - [ B]] - - This function will error out if the shapes are not evenly divisible - - :param A: left-hand tensor - :param B: right-hand tensor - :return: result - """ - if A.shape[-1] > B.shape[-2]: - head_dim = B.shape[-2] - num_heads = A.shape[-1] // head_dim - A = A.unflatten(-1, (num_heads, head_dim)) - return (A @ B).flatten(-2, -1) - elif A.shape[-1] < B.shape[-2]: - head_dim = A.shape[-1] - num_heads = B.shape[-2] // head_dim - B = B.unflatten(-2, (num_heads, head_dim)) - return (A @ B).flatten(-3, -2) - else: - return A @ B +__all__ = ["apply_transform_weight"] diff --git a/setup.py b/setup.py index 1b759fe83..16f7aaa17 100644 --- a/setup.py +++ b/setup.py @@ -186,5 +186,10 @@ def fetch_requirements(path): "License :: OSI Approved :: Apache Software License", ], include_package_data=True, - package_data={"": ["mllm/templates/*.json", "experimental/transform/utils/hadamards.safetensors"]}, + package_data={ + "": [ + "mllm/templates/*.json", + "algorithms/transforms/rotation/utils/hadamards.safetensors", + ] + }, ) From 8c96d9b3dc6a8caf535b376585147729c1ebfed7 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 11:35:48 +0800 Subject: [PATCH 77/90] refactor(rotation): physically migrate inplace+dispatcher to new arch Move 4 files from auto_round/experimental/ into the new arch's canonical home auto_round/algorithms/transforms/rotation/, then leave thin re-export shims at the old paths to preserve backward compatibility: experimental/rotation_inplace/apply_rotation_transform.py -> algorithms/transforms/rotation/inplace/apply.py experimental/rotation_inplace/utils.py -> algorithms/transforms/rotation/inplace/hooks.py experimental/rotation_inplace/model_config.py -> algorithms/transforms/rotation/inplace/model_config.py experimental/apply_rotation_transform.py -> algorithms/transforms/rotation/dispatcher.py Also adopt the unified Method-B RotationConfig schema in algorithms/transforms/rotation/config.py: * single class with backend in {auto, inplace, transform} * fuse_online_to_weight, allow_online_rotation, hadamard_type fields * normalize / dump_group_size / to_dict helpers * experimental/transform/rotation_config.py and the rotation helpers in experimental/utils.py become re-export shims Functionality is preserved: all internal imports rewired 
to canonical paths, and every external caller (compressors/base.py, inference, __main__.py, compressors_new/entry.py) continues to work via the shims. --- .../algorithms/transforms/rotation/config.py | 196 ++-- .../transforms/rotation/dispatcher.py | 154 +++ .../transforms/rotation/inplace/__init__.py | 12 + .../transforms/rotation/inplace/apply.py | 882 ++++++++++++++++++ .../transforms/rotation/inplace/hooks.py | 786 ++++++++++++++++ .../rotation/inplace/model_config.py | 169 ++++ .../experimental/apply_rotation_transform.py | 160 +--- .../experimental/rotation_inplace/__init__.py | 13 +- .../apply_rotation_transform.py | 882 +----------------- .../rotation_inplace/model_config.py | 168 +--- .../experimental/rotation_inplace/utils.py | 785 +--------------- .../experimental/transform/rotation_config.py | 60 +- auto_round/experimental/utils.py | 79 +- 13 files changed, 2165 insertions(+), 2181 deletions(-) create mode 100644 auto_round/algorithms/transforms/rotation/dispatcher.py create mode 100644 auto_round/algorithms/transforms/rotation/inplace/__init__.py create mode 100644 auto_round/algorithms/transforms/rotation/inplace/apply.py create mode 100644 auto_round/algorithms/transforms/rotation/inplace/hooks.py create mode 100644 auto_round/algorithms/transforms/rotation/inplace/model_config.py diff --git a/auto_round/algorithms/transforms/rotation/config.py b/auto_round/algorithms/transforms/rotation/config.py index ee1ab8ee8..fe579142c 100644 --- a/auto_round/algorithms/transforms/rotation/config.py +++ b/auto_round/algorithms/transforms/rotation/config.py @@ -11,11 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Hadamard rotation algorithm configuration.""" +"""Rotation/transform configuration (canonical, unified). + +This module is the **single source of truth** for the ``RotationConfig`` +schema. The legacy location +``auto_round.experimental.transform.rotation_config`` re-exports from here. + +Two implementation backends share this one schema (method B): + +* ``backend="inplace"`` – QuaRot-style residual-stream rotation, implemented + under :mod:`auto_round.experimental.rotation_inplace`. Works for any + weight/activation dtype and can optionally fuse the online Hadamard into + weights (``fuse_online_to_weight=True``). + +* ``backend="transform"`` – Per-Linear weight + activation Hadamard with a + fused triton kernel, implemented under + :mod:`auto_round.algorithms.transforms.rotation.apply`. Supports only + MXFP4 / NVFP4 and cannot fuse online to weight. + +* ``backend="auto"`` – dispatcher picks inplace when a fused online rotation + is requested, transform when the data_type is MX/NV-FP, inplace otherwise. +""" from __future__ import annotations -from typing import Any +from typing import Any, Optional from pydantic import BaseModel, Field, field_validator @@ -23,69 +43,117 @@ from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.utils import logger -__all__ = ["RotationConfig", "normalize_rotation_config"] +__all__ = [ + "RotationConfig", + "normalize_rotation_config", + "to_dict_rotation_config", + "dump_group_size_to_rotation_config", +] + # Supported Hadamard transform types (also used by HadamardTransform registry). 
HADAMARD_TYPES: frozenset[str] = frozenset({"hadamard", "random_hadamard", "quarot_hadamard"}) +_SUPPORTED_BACKENDS: frozenset[str] = frozenset({"auto", "inplace", "transform"}) class RotationConfig(BaseModel, BaseRotationConfig): - """Configuration for Hadamard rotation transforms. - - This config is designed to be embedded inside a model's ``config.json`` - for serialisation, and is also used at runtime to drive - :class:`~auto_round.algorithms.transforms.rotation.apply.HadamardRotation`. - - Attributes: - algorithm: Fixed to ``"hadamard"`` – identifies this config in the - :class:`~auto_round.algorithms.transforms.base.BaseRotation` registry. - block_size: Block size for the block-diagonal Hadamard matrix. - hadamard_type: Which transform to use (``"hadamard"``, ``"random_hadamard"``, - or ``"quarot_hadamard"``). - random_seed: For ``"random_hadamard"`` – seed the generator for - reproducibility. Excluded from serialisation (``exclude=True``) - because it is a calibration-time detail. + """Unified configuration for Hadamard rotation/transform applied to a model. + + See the module docstring for a description of the three backends. + + Notes: + * ``block_size`` is the group/block size for grouped Hadamard. + For ``backend="inplace"`` it is forwarded as ``group_size`` + (``None`` / ``-1`` means full-dimension Hadamard). """ - # Override BaseRotationConfig.algorithm with a literal default. + # Registry key consumed by BaseRotation.from_config (kept for API parity + # with other BaseRotationConfig subclasses). algorithm: str = Field(default="hadamard", frozen=True) - block_size: int = Field(default=32) + + # ---- shared ---- + backend: str = Field(default="auto") + block_size: Optional[int] = Field(default=None) hadamard_type: str = Field(default="hadamard") + + # ---- inplace-only ---- + fuse_online_to_weight: Optional[bool] = Field(default=None) + allow_online_rotation: bool = Field(default=True) + + # for random hadamard (transform path) random_seed: bool = Field(default=False, exclude=True) model_config = {"arbitrary_types_allowed": True} + @field_validator("backend") + @classmethod + def _validate_backend(cls, v: str) -> str: + if v not in _SUPPORTED_BACKENDS: + raise ValueError(f"Unsupported backend: {v}. Supported values: {sorted(_SUPPORTED_BACKENDS)}") + return v + @field_validator("hadamard_type") @classmethod def _validate_hadamard_type(cls, v: str) -> str: if v not in HADAMARD_TYPES: - raise ValueError(f"Unsupported hadamard_type: {v!r}. " f"Supported values: {sorted(HADAMARD_TYPES)}") + raise ValueError(f"Unsupported hadamard_type: {v!r}. Supported values: {sorted(HADAMARD_TYPES)}") return v -def normalize_rotation_config( - config: str | dict | RotationConfig | None, - data_type: str = "mx_fp", +# --------------------------------------------------------------------------- +# Helpers (free functions – match the old experimental/utils.py API) +# --------------------------------------------------------------------------- + + +def to_dict_rotation_config(rotation_config: str | dict | RotationConfig | None) -> dict[str, Any]: + """Convert any supported config form to a plain ``dict`` (no data-type logic). 
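+
+    A short illustrative sketch of the mapping (a hedged example; return values
+    are shown as comments and follow the rules listed below)::
+
+        to_dict_rotation_config(None)                # -> {}
+        to_dict_rotation_config("default")           # -> {"hadamard_type": "hadamard"}
+        to_dict_rotation_config("random_hadamard")   # -> {"hadamard_type": "random_hadamard"}
+        to_dict_rotation_config({"block_size": 16})  # -> {"block_size": 16}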
+ + Accepts: + * ``None`` → ``{}`` + * :class:`RotationConfig` → ``model_dump()`` + * ``dict`` → shallow-copied + * ``str`` → ``{"hadamard_type": key}`` (``"default"`` ⇒ plain default) + """ + if rotation_config is None: + return {} + if isinstance(rotation_config, str): + key = rotation_config.strip() + if not key: + return {} + if key == "default": + return {"hadamard_type": "hadamard"} + return {"hadamard_type": key} + if isinstance(rotation_config, RotationConfig): + return rotation_config.model_dump() + return dict(rotation_config) + + +def dump_group_size_to_rotation_config( + rotation_config: str | dict | RotationConfig, group_size: int ) -> dict[str, Any]: - """Normalise various input forms to a canonical ``dict`` for :class:`RotationConfig`. + """Return *rotation_config* as a dict with ``block_size`` populated from *group_size* (if unset).""" + rotation_dict = to_dict_rotation_config(rotation_config) + if rotation_dict.get("block_size", None) is None: + rotation_dict["block_size"] = group_size + return rotation_dict - Args: - config: One of: - * ``None`` → returns ``{}`` - * ``dict`` → validated via :class:`RotationConfig` - * :class:`RotationConfig` → converted to ``dict`` - * ``str`` shorthand → treated as ``hadamard_type`` - (``"default"`` → default :class:`RotationConfig`) - data_type: Quantization data type. Used to infer ``block_size`` - when not explicitly set (mx_fp → 32, nv_fp → 16). +def normalize_rotation_config( + rotation_config: str | dict | RotationConfig | None, + data_type: str = "mx_fp", +) -> dict[str, Any]: + """Normalise *rotation_config* to a validated ``dict`` ready for ``RotationConfig(**)``. - Returns: - A validated ``dict`` that can be passed to ``RotationConfig(**result)``. + Behaviour: + * ``None`` → ``{}`` + * If ``block_size`` is not set: + - ``mx_fp`` → default 32 + - ``nv_fp`` → default 16 + - other data types → emit a warning (no default) + * If ``block_size`` mismatches the data-type recommendation, emit a warning. Raises: - ValueError: If the config is invalid. - TypeError: If the config type is not recognised. + ValueError: If the resulting config is invalid. 
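+
+    Example (illustrative; assumes only the defaults documented above)::
+
+        normalize_rotation_config(None)                           # -> {}
+        normalize_rotation_config("default", data_type="mx_fp")   # block_size defaults to 32
+        normalize_rotation_config("default", data_type="nv_fp")   # block_size defaults to 16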
""" def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set: bool) -> dict[str, Any]: @@ -110,49 +178,13 @@ def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_ return cfg_dict - if config is None: + if rotation_config is None: return {} - if isinstance(config, RotationConfig): - raw_cfg_dict = config.model_dump(exclude_unset=True) - block_size_explicitly_set = "block_size" in raw_cfg_dict - cfg_dict = dict(raw_cfg_dict) - cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) - try: - return RotationConfig.model_validate(cfg_dict).model_dump() - except Exception as exc: - raise ValueError(f"Invalid RotationConfig: {exc}") from exc - - if isinstance(config, dict): - block_size_explicitly_set = "block_size" in config - cfg_dict = dict(config) - cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set) - try: - return RotationConfig.model_validate(cfg_dict).model_dump() - except Exception as exc: - raise ValueError(f"Invalid RotationConfig dict: {exc}") from exc - - if isinstance(config, str): - key = config.strip() - if not key: - return {} - if key == "default": - cfg_dict = {} - cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) - try: - return RotationConfig.model_validate(cfg_dict).model_dump() - except Exception as exc: - raise ValueError(f"Invalid default rotation_config after data_type adjustment: {exc}") from exc - if key not in HADAMARD_TYPES: - raise ValueError( - f"Unrecognised rotation config string: {key!r}. " - f"Expected one of {sorted(HADAMARD_TYPES)} or 'default'." - ) - cfg_dict = {"hadamard_type": key} - cfg_dict = _apply_data_type_block_size(cfg_dict, block_size_explicitly_set=False) - try: - return RotationConfig.model_validate(cfg_dict).model_dump() - except Exception as exc: - raise ValueError(f"Failed to build RotationConfig from {key!r}: {exc}") from exc - - raise TypeError("rotation_config must be None, dict, RotationConfig, or str " f"(got {type(config).__name__})") + rotation_dict = to_dict_rotation_config(rotation_config) + block_size_explicitly_set = "block_size" in rotation_dict + cfg_dict = _apply_data_type_block_size(rotation_dict, block_size_explicitly_set) + try: + return RotationConfig.model_validate(cfg_dict).model_dump() + except Exception as exc: + raise ValueError(f"Invalid RotationConfig: {exc}") from exc diff --git a/auto_round/algorithms/transforms/rotation/dispatcher.py b/auto_round/algorithms/transforms/rotation/dispatcher.py new file mode 100644 index 000000000..af3dee646 --- /dev/null +++ b/auto_round/algorithms/transforms/rotation/dispatcher.py @@ -0,0 +1,154 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +"""Unified entry point for Hadamard rotation/transform. + +Two backend implementations exist: + +* ``inplace`` – :mod:`auto_round.algorithms.transforms.rotation.inplace` + QuaRot-style residual-stream rotation. Works for any weight/activation + dtype. Optionally fuses the online Hadamard into weights + (``fuse_online_to_weight=True``). +* ``transform`` – :mod:`auto_round.experimental.transform` + Per-Linear weight + activation Hadamard with a fused triton kernel. + Only supports MXFP4 / NVFP4 and **cannot** fuse online to weight. 
+ +Routing is controlled by :class:`RotationConfig.backend`: + + "inplace" -> always inplace + "transform" -> always transform (validates dtype + no-fuse) + "auto" -> if user asked to fuse -> inplace + elif data_type is mx_fp / nv_fp -> transform + else -> inplace +""" + +from __future__ import annotations + +from typing import Any, Union + +import torch + +import auto_round.envs as envs +from auto_round.compressors.utils import is_mx_fp, is_nv_fp +from auto_round.algorithms.transforms.rotation.config import RotationConfig +from auto_round.algorithms.transforms.rotation.config import normalize_rotation_config +from auto_round.utils import logger + +__all__ = ["apply_hadamard_rotation", "resolve_hadamard_backend"] + + +def _to_config( + rotation_config: Union[str, dict, RotationConfig, None], + data_type: str, +) -> RotationConfig: + """Normalise *rotation_config* and return a :class:`RotationConfig` instance.""" + cfg_dict = normalize_rotation_config(rotation_config, data_type) + if isinstance(cfg_dict, RotationConfig): + return cfg_dict + return RotationConfig.model_validate(cfg_dict or {}) + + +def resolve_hadamard_backend(config: RotationConfig, data_type: str) -> str: + """Resolve the actual backend (``"inplace"`` / ``"transform"``) from config.""" + requested = config.backend + fuse_requested = bool(config.fuse_online_to_weight) + allow_online_rotation: bool = config.allow_online_rotation + + if requested == "inplace": + return "inplace" + + transform_backend_name = "transform" + if requested == "transform": + if fuse_requested: + raise ValueError( + f"backend='{transform_backend_name}' does not support fuse_online_to_weight=True. " + "Use backend='inplace' (or backend='auto' with fuse_online_to_weight=True) instead." + ) + if not (is_mx_fp(data_type) or is_nv_fp(data_type)): + raise ValueError( + f"backend='{transform_backend_name}' only supports MXFP4 / NVFP4 (got data_type={data_type!r}). " + "Use backend='inplace' or backend='auto' for other dtypes." + ) + if not allow_online_rotation: + raise ValueError(f"backend='{transform_backend_name}' only supports `allow_online_rotation`=True") + + return "transform" + + # backend == "auto" + if fuse_requested: + return "inplace" + if is_mx_fp(data_type) or is_nv_fp(data_type): + return "transform" + return "inplace" + + +def apply_hadamard_rotation( + model: torch.nn.Module, + rotation_config: Union[str, dict, RotationConfig, None], + data_type: str, + compute_device: torch.device | str = None, +) -> (torch.nn.Module, Any): + """Apply Hadamard rotation/transform to *model*, dispatching by backend. + + Args: + model: Target model. + rotation_config: ``str`` / ``dict`` / :class:`RotationConfig` / ``None``. + See :class:`RotationConfig` for fields. + data_type: Quantization data type (e.g. ``"mx_fp"``, ``"nv_fp"``, + ``"int"``, ``"fp"``). + compute_device: Device for inplace-backend computation. Ignored by + the transform backend. + + Returns: + The same model (for chaining); also stored on ``model.rotation_config``. 
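+
+    Illustrative call (a sketch only; the ``backend`` and ``data_type`` values
+    are examples, and the inplace branch below returns ``(model, hooks)``)::
+
+        model, hooks = apply_hadamard_rotation(
+            model,
+            {"backend": "inplace", "hadamard_type": "hadamard"},
+            data_type="int",
+        )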
+ """ + config = _to_config(rotation_config, data_type) + backend = resolve_hadamard_backend(config, data_type) + + # Resolve fuse flag: explicit > env var > default(True) + fuse_online_to_weight = config.fuse_online_to_weight + if config.fuse_online_to_weight is not None: + fuse_online_to_weight = bool(config.fuse_online_to_weight) + elif envs.AR_FUSE_ONLINE_ROTATION: + fuse_online_to_weight = bool(envs.AR_FUSE_ONLINE_ROTATION) + + logger.info( + f"Applying Hadamard (backend={backend}, " + f"data_type={data_type}, fuse_online_to_weight={fuse_online_to_weight if backend == 'inplace' else False})." + ) + + if backend == "inplace": + logger.warning("this backend does not support real exporting, please export the model to fake format") + from auto_round.algorithms.transforms.rotation.inplace import apply_rotation_transform + + # block_size -> group_size (None / -1 / 0 means full-dimension) + bs = config.block_size + group_size = bs if (bs is not None and bs > 0) else None + + model, hooks = apply_rotation_transform( + model, + group_size=group_size, + allow_online_rotation=config.allow_online_rotation, + rotation_matrix=config.hadamard_type, + fuse_online_to_weight=fuse_online_to_weight, + compute_device=compute_device, + ) + # Stash for downstream (export / serialization). Plain dict so JSON + # serialization (HF save_pretrained -> config.json) round-trips. + setattr(model, "rotation_config", config.model_dump() if hasattr(config, "model_dump") else config) + return model, hooks + + elif backend == "transform": + supported_hadamard_types = ("hadamard", "random_hadamard") + if config.hadamard_type not in supported_hadamard_types: + raise ValueError("this backend only supports hadamard or random_hadamard") + from auto_round.algorithms.transforms.rotation.apply import apply_rotation_transform + + return apply_rotation_transform(model, config, data_type=data_type) + else: + raise ValueError(f"Unsupported Hadamard backend {backend!r}") diff --git a/auto_round/algorithms/transforms/rotation/inplace/__init__.py b/auto_round/algorithms/transforms/rotation/inplace/__init__.py new file mode 100644 index 000000000..9bef07da9 --- /dev/null +++ b/auto_round/algorithms/transforms/rotation/inplace/__init__.py @@ -0,0 +1,12 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 +"""Inplace (QuaRot-style) Hadamard rotation backend. + +Canonical home of the residual-stream Hadamard rotation implementation +(formerly under :mod:`auto_round.experimental.rotation_inplace`). +""" + +from auto_round.algorithms.transforms.rotation.inplace.apply import apply_rotation_transform # noqa: F401 +from auto_round.algorithms.transforms.rotation.inplace.hooks import clear_random_hadamard_cache # noqa: F401 + +__all__ = ["apply_rotation_transform", "clear_random_hadamard_cache"] diff --git a/auto_round/algorithms/transforms/rotation/inplace/apply.py b/auto_round/algorithms/transforms/rotation/inplace/apply.py new file mode 100644 index 000000000..680c1950d --- /dev/null +++ b/auto_round/algorithms/transforms/rotation/inplace/apply.py @@ -0,0 +1,882 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +"""Hadamard inplace rotation — public API and rotation primitives. + +Supports LLaMA-2, LLaMA-3, Qwen-3 (and any model with the same layout). +The entry point is :func:`apply_hadamard_rotation`. 
+""" + +import gc +import typing +from typing import Dict, Union + +import torch +import tqdm + +from auto_round.algorithms.transforms.rotation.inplace.model_config import ( + MAPPING_REGISTRY, + RotationMapping, + _resolve, + infer_mapping_from_model, +) +from auto_round.algorithms.transforms.rotation.inplace.hooks import ( + CrossHeadOnlineHadamardHook, + FullOnlineHadamardHook, + GroupOnlineHadamardHook, + _get_custom_had, + _normalize_rotation_matrix, + _resolve_compute_device, + _rotate_embedding_grouped, + _rotate_linear_grouped, + apply_cross_head_had_to_linear, + apply_exact_had_to_linear, + deterministic_hadamard_matrix, + get_hadK, + get_or_create_random_hadamard, +) + +# --------------------------------------------------------------------------- +# Low-level primitives (model-agnostic via RotationMapping) +# --------------------------------------------------------------------------- + + +def _fuse_ln_linear( + layernorm: torch.nn.Module, + linear_layers: typing.Iterable[torch.nn.Linear], +) -> None: + """Fuse the linear operations in LayerNorm into adjacent linear blocks.""" + for linear in linear_layers: + linear_dtype = linear.weight.dtype + dev = linear.weight.device + + W_ = linear.weight.data.double() + ln_weight = layernorm.weight.double().to(dev) + linear.weight.data = (W_ * ln_weight).to(linear_dtype) + + if hasattr(layernorm, "bias") and layernorm.bias is not None: + if linear.bias is None: + linear.bias = torch.nn.Parameter(torch.zeros(linear.out_features, dtype=torch.float64, device=dev)) + ln_bias = layernorm.bias.double().to(dev) + linear.bias.data = linear.bias.data.double() + torch.matmul(W_, ln_bias) + linear.bias.data = linear.bias.data.to(linear_dtype) + + +def _reset_ln_params(layernorm: torch.nn.Module) -> None: + """Reset LayerNorm to identity: weight=1, bias=0.""" + layernorm.weight.data.fill_(1.0) + if hasattr(layernorm, "bias") and layernorm.bias is not None: + layernorm.bias.data.fill_(0.0) + + +def _rotate_linear_by_Q(module: torch.nn.Linear, Q: torch.Tensor, side: str, compute_device=None) -> None: + """Apply rotation *Q* to a Linear layer's weight (and bias if present). + + Args: + side: ``'input'`` → W = W @ Q (rotate input side) + ``'output'`` → W = Q^T @ W (rotate output side) + compute_device: Device to run computation on. If None, auto-detects GPU. + """ + dtype = module.weight.data.dtype + dev = module.weight.data.device + cdev = _resolve_compute_device(compute_device) + W_ = module.weight.data.to(device=cdev, dtype=torch.float64) + Q_ = Q.to(device=cdev) + if side == "input": + new_W = torch.matmul(W_, Q_).to(device=dev, dtype=dtype) + else: + new_W = torch.matmul(Q_.T, W_).to(device=dev, dtype=dtype) + # Release fp64 copy before assigning back so peak memory ≈ 1× weight + 1× rotated. 
+ del W_ + module.weight.data = new_W + if side == "output" and module.bias is not None: + b = module.bias.data.to(device=cdev, dtype=torch.float64) + new_b = torch.matmul(Q_.T, b).to(device=dev, dtype=dtype) + del b + module.bias.data = new_b + del Q_ + + +def _untie_word_embeddings(model, mapping: RotationMapping) -> None: + """Break tied weights between lm_head and embedding if they share the same tensor.""" + embedding = _resolve(model, mapping.embedding) + lm_head = _resolve(model, mapping.lm_head) + + if lm_head.weight.data_ptr() != embedding.weight.data_ptr(): + return + + lm_head.weight = torch.nn.Parameter(lm_head.weight.data.clone()) + if hasattr(model.config, "tie_word_embeddings"): + model.config.tie_word_embeddings = False + + +def _uses_layernorm_with_mean(model, mapping: RotationMapping) -> bool: + """Check whether the model uses standard LayerNorm (which subtracts mean).""" + layers = _resolve(model, mapping.layers_attr) + first_ln = _resolve(layers[0], mapping.attn_input_ln) + return isinstance(first_ln, torch.nn.LayerNorm) + + +def _bake_mean_into_linear(linear: torch.nn.Linear) -> None: + """Subtract column-wise mean from a Linear layer's weight (and mean from bias).""" + linear_dtype = linear.weight.dtype + W_ = linear.weight.data.double() + linear.weight.data = (W_ - W_.mean(dim=-2, keepdim=True)).to(linear_dtype) + if linear.bias is not None: + b_ = linear.bias.data.double() + linear.bias.data = (b_ - b_.mean()).to(linear_dtype) + + +def _subtract_embedding_mean(model, mapping: RotationMapping) -> None: + """Subtract per-row mean from the embedding weight matrix.""" + W = _resolve(model, mapping.embedding) + dtype = W.weight.data.dtype + W_ = W.weight.data.to(dtype=torch.float64) + W.weight.data = (W_ - W_.mean(dim=-1, keepdim=True)).to(dtype=dtype) + + if mapping.positional_embedding is not None: + P = _resolve(model, mapping.positional_embedding) + p_dtype = P.weight.data.dtype + P_ = P.weight.data.to(dtype=torch.float64) + P.weight.data = (P_ - P_.mean(dim=-1, keepdim=True)).to(dtype=p_dtype) + + +class _RMSNorm(torch.nn.Module): + """RMS Normalization (no mean subtraction).""" + + def __init__(self, dim: int, eps: float = 1e-5): + super().__init__() + self.eps = eps + self.register_buffer("weight", torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps) + return x / rms * self.weight + + +def _replace_layernorms_with_rmsnorm(model) -> None: + """Replace all ``nn.LayerNorm`` modules with ``_RMSNorm``.""" + replacements = [] + for name, module in model.named_modules(): + if isinstance(module, torch.nn.LayerNorm): + replacements.append((name, module)) + + for name, module in replacements: + parts = name.rsplit(".", 1) + if len(parts) == 2: + parent = _resolve(model, parts[0]) + attr = parts[1] + else: + parent = model + attr = parts[0] + rms = _RMSNorm(module.normalized_shape[0], eps=module.eps) + rms = rms.to(device=module.weight.device, dtype=module.weight.dtype) + setattr(parent, attr, rms) + + +# --------------------------------------------------------------------------- +# High-level steps driven by RotationMapping +# --------------------------------------------------------------------------- + + +def _fuse_layer_norms(model, mapping: RotationMapping) -> None: + """Fuse all LayerNorm parameters into adjacent Linear layers.""" + layers = _resolve(model, mapping.layers_attr) + + for layer in layers: + mlp_ln = _resolve(layer, mapping.mlp_input_ln) + mlp_linears = [_resolve(layer, 
p) for p in mapping.mlp_in]
+        _fuse_ln_linear(mlp_ln, mlp_linears)
+        _reset_ln_params(mlp_ln)
+
+        attn_ln = _resolve(layer, mapping.attn_input_ln)
+        attn_linears = [
+            _resolve(layer, mapping.attn_q),
+            _resolve(layer, mapping.attn_k),
+            _resolve(layer, mapping.attn_v),
+        ]
+        _fuse_ln_linear(attn_ln, attn_linears)
+        _reset_ln_params(attn_ln)
+
+    pre_head_ln = _resolve(model, mapping.pre_head_ln)
+    lm_head = _resolve(model, mapping.lm_head)
+    _fuse_ln_linear(pre_head_ln, [lm_head])
+    _reset_ln_params(pre_head_ln)
+
+
+# ---------------------------------------------------------------------------
+# Unified weight rotation (full or grouped)
+# ---------------------------------------------------------------------------
+
+
+@torch.inference_mode()
+def _rotate_weights(
+    model,
+    mapping: RotationMapping,
+    use_fast_had: bool = True,
+    group_size: int = None,
+    compute_device: torch.device = None,
+    had_dict: dict = None,
+    preset: str = None,
+    fuse_online_to_weight: bool = True,
+) -> None:
+    """Apply Hadamard rotation to all weights.
+
+    Args:
+        group_size: ``None`` → full Hadamard rotation.
+            ``int`` → block-diagonal rotation with this block size.
+        compute_device: Device to run Hadamard computation on (e.g. ``"cuda:0"``).
+            Weights are moved there temporarily and moved back afterwards.
+            If ``None``, auto-detects GPU availability.
+        fuse_online_to_weight: If ``True`` (default), additionally fuse input-side
+            Hadamard rotations into ``down_proj`` and the OV pair (``v_proj``
+            output + ``o_proj`` input); these require compensating online hooks
+            at inference time. If ``False``, skip the embedding/lm_head rotation
+            and make each linear layer self-contained (input-side Hadamard on the
+            weight plus a compensating online hook on its activation).
+        had_dict: Normalized ``dict[int, Tensor]`` of custom Hadamard matrices
+            (keyed by dimension). Only used in grouped mode.
+        preset: Rotation preset name (``"quarot_hadamard"``, ``"hadamard"``,
+            ``"random_hadamard"``, or ``None``).
+
+            * ``"quarot_hadamard"``: fusable (residual-stream) rotations use
+              ``fast_hadamard_transform`` / random Hadamard; non-fusable
+              (online-paired) rotations and their weight-side counterparts use
+              deterministic ``get_hadK``/``matmul_hadU`` so that the online
+              hook at inference produces the exact same transform.
+            * ``"hadamard"``: all rotations use deterministic ``get_hadK`` /
+              ``matmul_hadU``. Full-mode Q is a deterministic Hadamard matrix.
+            * ``"random_hadamard"``: all rotations use random Hadamard matrices
+              from the global cache (``get_or_create_random_hadamard``).
+              Same dimension → same matrix everywhere.
+            * ``None``: same behaviour as ``"hadamard"`` (built-in butterfly).
+ """ + compute_device = _resolve_compute_device(compute_device) + config = model.config + hidden_size = getattr(config, mapping.hidden_size_attr) + intermediate_size = getattr(config, mapping.intermediate_size_attr) + num_heads = getattr(config, mapping.num_heads_attr) + head_dim = mapping.attn_head_dim or (hidden_size // num_heads) + + is_grouped = group_size is not None and group_size > 0 + desc = f"Rotating (group_size={group_size})" if is_grouped else "Rotating" + + # ----- Resolve per-operation Hadamard sources ----- + fused_fast = use_fast_had + online_fast = False + if preset == "random_hadamard": + fused_fast = False + + # -- Matrix resolution -- + had_matrix, _found = _get_custom_had(had_dict, group_size) if is_grouped else (None, False) + + online_had_matrix = had_matrix + if preset == "random_hadamard" and had_matrix is None: + had_matrix = get_or_create_random_hadamard(group_size if is_grouped else hidden_size, compute_device) + online_had_matrix = had_matrix + if preset == "quarot_hadamard" and is_grouped: + online_had_matrix = None # force deterministic for online-paired + + # -- Helper: look up cached random matrix for online-paired ops -- + def _online_had(dim): + """Return cached random matrix for *dim* under random_hadamard, else None.""" + if preset == "random_hadamard": + return get_or_create_random_hadamard(dim, compute_device) + return None + + if is_grouped: + assert hidden_size % group_size == 0, f"group_size={group_size} must divide hidden_size={hidden_size}" + assert ( + intermediate_size % group_size == 0 + ), f"group_size={group_size} must divide intermediate_size={intermediate_size}" + + # --- Full mode: build Hadamard matrix Q --- + Q = None + if not is_grouped: + if preset == "hadamard": + Q = deterministic_hadamard_matrix(hidden_size, compute_device) + else: + # "random_hadamard", "quarot_hadamard", None — same shape → same matrix + Q = get_or_create_random_hadamard(hidden_size, compute_device) + + # ---- Top-level: embedding / lm_head ---- + # When fuse_online_to_weight=False, skip embedding and lm_head rotation: + # each layer is self-contained (weight rotation + online hook cancel out). 
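# A standalone numeric sketch (not part of this function) of why rotating the
# embedding table and lm_head below with the same orthogonal Q leaves the logits
# unchanged, ignoring the intermediate layers, which are made rotation-consistent
# further down. `sylvester_hadamard` and all sizes are illustrative stand-ins,
# not this module's own helpers.
import torch

def sylvester_hadamard(n: int) -> torch.Tensor:
    # normalized +/-1 Hadamard matrix for a power-of-two n (illustrative helper)
    H = torch.ones(1, 1, dtype=torch.float64)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H / torch.tensor(float(n)).sqrt()

hidden, vocab = 16, 11
Q = sylvester_hadamard(hidden)                            # orthogonal: Q @ Q.T == I
emb = torch.randn(vocab, hidden, dtype=torch.float64)     # stand-in embedding table
head = torch.randn(vocab, hidden, dtype=torch.float64)    # stand-in lm_head weight
logits_ref = head @ emb[3]                                # logits for token id 3
emb_rot = emb @ Q                                         # embedding rotation (W @ Q)
head_rot = head @ Q                                       # lm_head rotated on its input side
assert torch.allclose(head_rot @ emb_rot[3], logits_ref)  # Q @ Q.T cancels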
+ if fuse_online_to_weight: + embedding = _resolve(model, mapping.embedding) + if is_grouped: + _rotate_embedding_grouped( + embedding, group_size, use_fast_had=fused_fast, compute_device=compute_device, had_matrix=had_matrix + ) + else: + dtype = embedding.weight.data.dtype + dev = embedding.weight.data.device + cdev = compute_device + W_ = embedding.weight.data.to(device=cdev, dtype=torch.float64) + new_W = torch.matmul(W_, Q.to(cdev)).to(device=dev, dtype=dtype) + del W_ + embedding.weight.data = new_W + + if mapping.positional_embedding is not None: + pos_emb = _resolve(model, mapping.positional_embedding) + if is_grouped: + _rotate_embedding_grouped( + pos_emb, group_size, use_fast_had=fused_fast, compute_device=compute_device, had_matrix=had_matrix + ) + else: + pos_dtype = pos_emb.weight.data.dtype + pos_dev = pos_emb.weight.data.device + cdev = compute_device + P_ = pos_emb.weight.data.to(device=cdev, dtype=torch.float64) + new_P = torch.matmul(P_, Q.to(cdev)).to(device=pos_dev, dtype=pos_dtype) + del P_ + pos_emb.weight.data = new_P + + # ---- Top-level: lm_head ---- + lm_head = _resolve(model, mapping.lm_head) + if is_grouped: + _rotate_linear_grouped( + lm_head, + group_size, + side="input", + use_fast_had=fused_fast, + compute_device=compute_device, + had_matrix=had_matrix, + ) + else: + _rotate_linear_by_Q(lm_head, Q, side="input", compute_device=compute_device) + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # ---- Per-layer rotation ---- + layers = _resolve(model, mapping.layers_attr) + for layer in tqdm.tqdm(layers, unit="layer", desc=desc): + if fuse_online_to_weight: + # ---- fuse mode: QuaRot-style residual stream rotation ---- + # Q/K/V: only residual Q on input (no online Had stacking, no hook). + # When Q == online Had (e.g. preset="hadamard"), Q @ Q = I cancels + # the rotation entirely, destroying quantization benefit. + # gate/up: only residual Q on input (no online Had stacking, no hook). + # down_proj: residual Q^T on output + online Had on input (+ hook). + # v_proj/o_proj: per-head/cross-head Had below (+ hook on o_proj). 
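# A standalone sketch (not part of this loop) of the down_proj pairing described
# above: residual Q^T folded into the output side plus an online Hadamard H folded
# into the input side, compensated by applying H to the activation at runtime.
# The helper and all sizes are illustrative, not this module's own code paths.
import torch

def sylvester_hadamard(n: int) -> torch.Tensor:
    # normalized +/-1 Hadamard matrix for a power-of-two n (illustrative helper)
    H = torch.ones(1, 1, dtype=torch.float64)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H / torch.tensor(float(n)).sqrt()

hidden, inter = 16, 32
Q = sylvester_hadamard(hidden)                             # residual-stream rotation
H = sylvester_hadamard(inter)                              # online Had on down_proj input
W_down = torch.randn(hidden, inter, dtype=torch.float64)   # stand-in down_proj weight
x = torch.randn(2, inter, dtype=torch.float64)             # stand-in MLP activation
W_rot = (Q.T @ W_down) @ H                                 # output-side Q^T, then input-side H
y_fused = (x @ H) @ W_rot.T                                # the online hook applies H to x
y_ref = (x @ W_down.T) @ Q                                 # original output in the rotated stream
assert torch.allclose(y_fused, y_ref)                      # H @ H.T cancels, Q^T remains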
+ for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v): + mod = _resolve(layer, attr) + if is_grouped: + _rotate_linear_grouped( + mod, + group_size, + side="input", + use_fast_had=fused_fast, + compute_device=compute_device, + had_matrix=had_matrix, + ) + else: + _rotate_linear_by_Q(mod, Q, side="input", compute_device=compute_device) + + # o_proj: residual stream output rotation + if is_grouped: + _rotate_linear_grouped( + _resolve(layer, mapping.attn_o), + group_size, + side="output", + use_fast_had=fused_fast, + compute_device=compute_device, + had_matrix=had_matrix, + ) + else: + _rotate_linear_by_Q(_resolve(layer, mapping.attn_o), Q, side="output", compute_device=compute_device) + + # gate/up: only residual Q on input + for attr in mapping.mlp_in: + mod = _resolve(layer, attr) + if is_grouped: + _rotate_linear_grouped( + mod, + group_size, + side="input", + use_fast_had=fused_fast, + compute_device=compute_device, + had_matrix=had_matrix, + ) + else: + _rotate_linear_by_Q(mod, Q, side="input", compute_device=compute_device) + + # down_proj: residual output + online input Had + down_proj = _resolve(layer, mapping.mlp_out) + if is_grouped: + _rotate_linear_grouped( + down_proj, + group_size, + side="output", + use_fast_had=fused_fast, + compute_device=compute_device, + had_matrix=had_matrix, + ) + _rotate_linear_grouped( + down_proj, + group_size, + side="input", + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_had_matrix, + ) + else: + _rotate_linear_by_Q(down_proj, Q, side="output", compute_device=compute_device) + apply_exact_had_to_linear( + down_proj, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(intermediate_size), + ) + + # OV projection: v_proj per-head output + o_proj full/cross-head input + v_proj = _resolve(layer, mapping.attn_v) + o_proj = _resolve(layer, mapping.attn_o) + if is_grouped: + pass + else: + online_head_had = _online_had(head_dim) + apply_exact_had_to_linear( + v_proj, + had_dim=head_dim, + output=True, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_head_had, + ) + if preset == "random_hadamard": + apply_exact_had_to_linear( + o_proj, + had_dim=head_dim, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_head_had, + ) + apply_cross_head_had_to_linear( + o_proj, + num_heads, + head_dim, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(num_heads), + ) + else: + apply_exact_had_to_linear( + o_proj, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + ) + + else: + # ---- unfused mode: no residual rotation, only input-side Had ---- + # Each layer gets Had fused on input side + compensating hook → equivalent. + # No embedding/lm_head rotation. No self-cancelling pair. + # v_proj treated same as Q/K (input Had only, no per-head/cross-head). 
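# A standalone sketch (not part of this loop) of the unfused pairing described
# above: an input-side Hadamard folded into a Linear weight plus a compensating
# forward pre-hook leaves the layer's outputs unchanged. The inline lambda hook
# and the layer sizes are illustrative, not the hook classes used by this module.
import math
import torch

def sylvester_hadamard(n: int) -> torch.Tensor:
    # normalized +/-1 Hadamard matrix for a power-of-two n (illustrative helper)
    H = torch.ones(1, 1, dtype=torch.float64)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H / math.sqrt(n)

lin = torch.nn.Linear(16, 8, bias=False).double()
x = torch.randn(4, 16, dtype=torch.float64)
y_ref = lin(x)
H = sylvester_hadamard(16)
lin.weight.data = lin.weight.data @ H                                     # fuse input-side Had into the weight
handle = lin.register_forward_pre_hook(lambda mod, args: (args[0] @ H,))  # compensating online Had
assert torch.allclose(lin(x), y_ref)                                      # the pair is a no-op end to end
handle.remove()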
+ + # Q/K/V: input-side Had on hidden_size + for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v): + mod = _resolve(layer, attr) + if is_grouped: + _rotate_linear_grouped( + mod, + group_size, + side="input", + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_had_matrix, + ) + else: + apply_exact_had_to_linear( + mod, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(hidden_size), + ) + + # o_proj: input-side Had on hidden_size (full Had, not cross-head) + o_proj = _resolve(layer, mapping.attn_o) + if is_grouped: + _rotate_linear_grouped( + o_proj, + group_size, + side="input", + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_had_matrix, + ) + else: + apply_exact_had_to_linear( + o_proj, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(hidden_size), + ) + + # gate/up: input-side Had on hidden_size + for attr in mapping.mlp_in: + mod = _resolve(layer, attr) + if is_grouped: + _rotate_linear_grouped( + mod, + group_size, + side="input", + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_had_matrix, + ) + else: + apply_exact_had_to_linear( + mod, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(hidden_size), + ) + + # down_proj: input-side Had on intermediate_size + down_proj = _resolve(layer, mapping.mlp_out) + if is_grouped: + _rotate_linear_grouped( + down_proj, + group_size, + side="input", + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=online_had_matrix, + ) + else: + apply_exact_had_to_linear( + down_proj, + had_dim=-1, + output=False, + use_fast_had=online_fast, + compute_device=compute_device, + had_matrix=_online_had(intermediate_size), + ) + + # Per-layer cleanup: drop fp64 temporaries and CUDA caching allocator + # blocks so peak memory stays at ~1 layer's worth instead of accumulating + # across all 32+ decoder layers (was the main cause of 33 GB RAM on 8B). + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +# --------------------------------------------------------------------------- +# Unified online hook registration +# --------------------------------------------------------------------------- + + +def _register_online_hooks( + model, + mapping: RotationMapping, + fp32_had: bool = False, + use_fast_had: bool = True, + group_size: int = None, + had_dict: dict = None, + preset: str = None, + fuse_online_to_weight: bool = True, +): + """Register online Hadamard pre-forward hooks on ``down_proj`` and ``o_proj``. + + Online hooks must use the **same** Hadamard matrix that was applied to the + weight-side counterpart during ``_rotate_weights``. For ``quarot_hadamard`` + this is always the deterministic ``get_hadK``/``matmul_hadU`` path + (``use_fast_had=False``). For ``"random_hadamard"`` it is the random matrix that + was generated once and stored in ``had_dict``. + + Args: + group_size: ``None`` → full Hadamard hooks (original QuaRot). + ``int`` → per-group Hadamard hooks. + had_dict: Normalized ``dict[int, Tensor]`` of custom Hadamard matrices. + preset: Rotation preset name. + Returns: + list of hook handles. 
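    A short usage sketch (the keyword values are illustrative and assume the model's
    weights were already rotated by ``_rotate_weights`` with matching arguments)::

        handles = _register_online_hooks(model, mapping, group_size=None, preset="hadamard")
        ...  # run calibration / inference with the online Hadamard hooks in place
        for h in handles:
            h.remove()  # detach all online hooks again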
+ """ + config = model.config + num_heads = getattr(config, mapping.num_heads_attr) + hidden_size = getattr(config, mapping.hidden_size_attr) + intermediate_size = getattr(config, mapping.intermediate_size_attr) + head_dim = mapping.attn_head_dim or (hidden_size // num_heads) + + is_grouped = group_size is not None and group_size > 0 + + # Online hooks always use deterministic (fixed) Hadamard — never fast_had + # for quarot_hadamard; for "random_hadamard" they use the same random matrix + # that was cached in had_dict by _rotate_weights. + online_fast = False + + # -- Matrix resolution (must match the *online-paired* matrix used by + # _rotate_weights for down_proj input / OV pair). Variable name kept in + # sync with _rotate_weights to make any future drift obvious. + online_had_matrix, _ = _get_custom_had(had_dict, group_size) if is_grouped else (None, False) + if preset == "random_hadamard" and online_had_matrix is None: + online_had_matrix = get_or_create_random_hadamard(group_size if is_grouped else hidden_size) + if preset == "quarot_hadamard" and is_grouped: + online_had_matrix = None + + # -- Helper: look up cached random matrix for online-paired hooks -- + def _online_had(dim): + if preset == "random_hadamard": + return get_or_create_random_hadamard(dim) + return None + + mlp_out_suffix = mapping.mlp_out.split(".")[-1] + attn_o_suffix = mapping.attn_o.split(".")[-1] + + # Suffixes for Q/K/V and gate/up (for online input Had hooks) + attn_qkv_suffixes = set(attr.split(".")[-1] for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v)) + mlp_in_suffixes = set(attr.split(".")[-1] for attr in mapping.mlp_in) + + # --- Build hook factories --- + def _make_down_proj_hook(): + if is_grouped: + return GroupOnlineHadamardHook( + group_size=group_size, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_had_matrix + ) + online_mat = _online_had(intermediate_size) + if online_mat is not None: + return FullOnlineHadamardHook( + had_K=None, K=None, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_mat + ) + had_K, K = get_hadK(intermediate_size) + return FullOnlineHadamardHook(had_K=had_K, K=K, fp32_had=fp32_had, use_fast_had=online_fast) + + def _make_hidden_had_hook(): + """Full Had hook on hidden_size (for Q/K/V and gate/up input).""" + if is_grouped: + return GroupOnlineHadamardHook( + group_size=group_size, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_had_matrix + ) + online_mat = _online_had(hidden_size) + if online_mat is not None: + return FullOnlineHadamardHook( + had_K=None, K=None, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_mat + ) + had_K, K = get_hadK(hidden_size) + return FullOnlineHadamardHook(had_K=had_K, K=K, fp32_had=fp32_had, use_fast_had=online_fast) + + def _make_o_proj_hook(): + online_mat = _online_had(num_heads) + if online_mat is not None: + return CrossHeadOnlineHadamardHook( + had_K=None, + K=None, + head_dim=head_dim, + fp32_had=fp32_had, + use_fast_had=online_fast, + had_matrix=online_mat, + ) + had_K, K = get_hadK(num_heads) + return CrossHeadOnlineHadamardHook( + had_K=had_K, + K=K, + head_dim=head_dim, + fp32_had=fp32_had, + use_fast_had=online_fast, + ) + + # --- Register --- + handles = [] + + for name, module in model.named_modules(): + if not isinstance(module, torch.nn.Linear): + continue + suffix = name.split(".")[-1] + + if name.endswith(mlp_out_suffix): + # down_proj: full Had on intermediate_size input + h = module.register_forward_pre_hook(_make_down_proj_hook()) + handles.append(h) + 
elif name.endswith(attn_o_suffix):
+            if fuse_online_to_weight and not is_grouped:
+                # o_proj: cross-head Had on input (fused mode, full only)
+                h = module.register_forward_pre_hook(_make_o_proj_hook())
+                handles.append(h)
+            elif not fuse_online_to_weight:
+                # o_proj: full Had on hidden_size input (unfused mode, matches weight rotation)
+                h = module.register_forward_pre_hook(_make_hidden_had_hook())
+                handles.append(h)
+        elif suffix in attn_qkv_suffixes:
+            if not fuse_online_to_weight:
+                # Q/K/V: full Had on hidden_size input (unfused mode only).
+                # In fused mode Q/K/V only have residual Q on weight (no online Had),
+                # and activations come pre-rotated from residual stream → no hook needed.
+                h = module.register_forward_pre_hook(_make_hidden_had_hook())
+                handles.append(h)
+        elif suffix in mlp_in_suffixes:
+            if not fuse_online_to_weight:
+                # gate/up: full Had on hidden_size input (unfused mode only).
+                # Same reasoning as Q/K/V above.
+                h = module.register_forward_pre_hook(_make_hidden_had_hook())
+                handles.append(h)
+
+    return handles
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def apply_rotation_transform(
+    model,
+    group_size: int = None,
+    allow_online_rotation: bool = True,
+    rotation_matrix: Union[str, torch.Tensor, Dict[int, torch.Tensor], None] = None,
+    compute_device: torch.device | str = None,
+    fp32_had: bool = False,
+    fuse_online_to_weight: bool = None,
+):
+    """Fuse layer norms, rotate weights, and register online Hadamard hooks.
+
+    This is the single entry point for applying Hadamard inplace rotation.
+    The model architecture is auto-detected via ``model.config.model_type``.
+
+    Args:
+        model: A HuggingFace CausalLM model (LLaMA-2/3, Qwen-3, etc.).
+        fp32_had: Whether to compute the online Hadamard transform in fp32.
+        group_size: If ``None`` (default), use full-dimension Hadamard rotation.
+        compute_device: Device to run Hadamard computation on.
+        allow_online_rotation: If ``True`` (default), apply online Hadamard
+            rotations on ``down_proj`` input and the OV pair.
+        rotation_matrix: Rotation matrix selection (``"hadamard"``,
+            ``"random_hadamard"``, ``"quarot_hadamard"``, Tensor, dict, or None).
+        fuse_online_to_weight: If ``True``, fuse online Hadamard rotation into
+            weights (down_proj input, v_proj output, o_proj input) and register
+            compensating online hooks. If ``False``, skip embedding/lm_head
+            rotation; each linear layer is self-contained with input-side Had on
+            the weight + compensating online hook on the activation. No v_proj
+            cross-head or inner-head rotation. If ``None`` (default), fusing is
+            enabled automatically for architectures with a registered rotation
+            mapping and disabled otherwise.
+
+    Returns:
+        ``(model, handles)``: the rotated model and the list of registered
+        online hook handles."""
+    if fuse_online_to_weight is None:
+        if model.config.model_type in MAPPING_REGISTRY or model.__class__.__name__ in MAPPING_REGISTRY:
+            fuse_online_to_weight = True
+        else:
+            fuse_online_to_weight = False
+    had_dict, use_fast_had, preset = _normalize_rotation_matrix(rotation_matrix, group_size)
+    compute_device = _resolve_compute_device(compute_device)
+
+    if use_fast_had:
+        from auto_round.utils import logger
+
+        try:
+            import fast_hadamard_transform  # noqa: F401
+
+            if group_size is None:
+                logger.warning(
+                    "fast_hadamard_transform uses a different Hadamard matrix than the "
+                    "default implementation. Please ensure consistency between training "
+                    "and inference. This will be refined later."
+ ) + except ImportError: + logger.warning("Importing fast_hadamard_transform failed, falling back to default implementation.") + use_fast_had = False + + mapping = infer_mapping_from_model(model) + + _untie_word_embeddings(model, mapping) + + if _uses_layernorm_with_mean(model, mapping): + _subtract_embedding_mean(model, mapping) + + _fuse_layer_norms(model, mapping) + + if _uses_layernorm_with_mean(model, mapping): + layers = _resolve(model, mapping.layers_attr) + for layer in layers: + _bake_mean_into_linear(_resolve(layer, mapping.attn_o)) + _bake_mean_into_linear(_resolve(layer, mapping.mlp_out)) + _replace_layernorms_with_rmsnorm(model) + + _rotate_weights( + model, + mapping, + use_fast_had=use_fast_had, + group_size=group_size, + compute_device=compute_device, + had_dict=had_dict, + preset=preset, + fuse_online_to_weight=fuse_online_to_weight, + ) + + handles = [] + if fuse_online_to_weight or allow_online_rotation: + handles = _register_online_hooks( + model, + mapping, + fp32_had=fp32_had, + use_fast_had=use_fast_had, + group_size=group_size, + had_dict=had_dict, + preset=preset, + fuse_online_to_weight=fuse_online_to_weight, + ) + + return model, handles + + +# --------------------------------------------------------------------------- +# Quick smoke test +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + from transformers import AutoModelForCausalLM, AutoTokenizer + + model_name = "/models/opt-125m" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model.to("cuda") + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + + apply_rotation_transform( + model, group_size=-1, allow_online_rotation=True, rotation_matrix="random_hadamard", fuse_online_to_weight=False + ) + model.to("cuda") + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + + model_name = "/models/Qwen3-8B" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + apply_rotation_transform(model, group_size=-1, allow_online_rotation=True, fuse_online_to_weight=True) + model.to("cuda") + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + + from transformers import AutoModelForCausalLM, AutoTokenizer + + model_name = "/models/Meta-Llama-3.1-8B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + apply_rotation_transform(model, fuse_online_to_weight=True, group_size=32) + model.to("cuda") + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + # + # model_name = "/models/Llama-2-7b-chat-hf" + # tokenizer = AutoTokenizer.from_pretrained(model_name) + # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + # apply_hadamard_rotation(model) + # 
model.to("cuda") + # text = "There is a girl who likes adventure," + # inputs = tokenizer(text, return_tensors="pt").to(model.device) + # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) diff --git a/auto_round/algorithms/transforms/rotation/inplace/hooks.py b/auto_round/algorithms/transforms/rotation/inplace/hooks.py new file mode 100644 index 000000000..2b3d26c6e --- /dev/null +++ b/auto_round/algorithms/transforms/rotation/inplace/hooks.py @@ -0,0 +1,786 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +"""Online Hadamard transform hooks. + +After weight rotation, down_proj and o_proj require an online Hadamard +transform on their *input activations* at inference time. This module +provides the hooks and a helper to register them on the model. +""" + +import math + +import torch +import torch.nn as nn + +try: + import fast_hadamard_transform +except ImportError: + fast_hadamard_transform = None + + +def _resolve_compute_device(compute_device) -> torch.device: + """Return *compute_device* if explicitly given, otherwise auto-detect GPU. + + When ``compute_device`` is ``None`` the function checks for CUDA / XPU + availability and returns the first accelerator it finds so that heavy + matrix operations are offloaded to GPU even when the model weights live + on CPU. Falls back to ``torch.device("cpu")`` when no accelerator is + present. + """ + if compute_device is not None: + return torch.device(compute_device) if not isinstance(compute_device, torch.device) else compute_device + if torch.cuda.is_available(): + return torch.device("cuda:0") + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return torch.device("xpu:0") + return torch.device("cpu") + + +BUILTIN_ROTATION_PRESETS = {"quarot_hadamard", "hadamard", "random_hadamard"} + +# Global cache for random Hadamard matrices keyed by dimension. +# Ensures the same shape always returns the exact same random matrix within +# a process, across all calls to ``_rotate_weights`` / ``_register_online_hooks``. +_RANDOM_HADAMARD_CACHE: dict = {} + + +def get_or_create_random_hadamard(dim: int, device=None) -> torch.Tensor: + """Return a random Hadamard matrix for *dim*, creating and caching it if needed. + + The matrix is cached globally in ``_RANDOM_HADAMARD_CACHE`` so that every + caller that requests the same *dim* receives the identical matrix. + """ + if dim in _RANDOM_HADAMARD_CACHE: + mat = _RANDOM_HADAMARD_CACHE[dim] + if device is not None: + mat = mat.to(device) + return mat + mat = random_hadamard_matrix(dim, device or torch.device("cpu")) + _RANDOM_HADAMARD_CACHE[dim] = mat + return mat + + +def clear_random_hadamard_cache(): + """Clear the global random Hadamard matrix cache. + + Call this when you want subsequent ``random_hadamard`` preset runs to + generate fresh random matrices (e.g. between independent experiments). + """ + _RANDOM_HADAMARD_CACHE.clear() + + +def _normalize_rotation_matrix(rotation_matrix, group_size): + """Normalize ``rotation_matrix`` into a ``(had_dict, use_fast_had, preset)`` tuple. + + Accepted inputs: + * ``None`` → ``(None, False, None)`` — use built-in butterfly ``matmul_hadU``. + * ``"quarot_hadamard"`` → ``(None, True, "quarot_hadamard")`` — fusable + rotations use ``fast_hadamard_transform`` (random); non-fusable + (online-paired) rotations use deterministic ``get_hadK``/``matmul_hadU``. + * ``"hadamard"`` → ``(None, False, "hadamard")`` — all rotations use + deterministic ``get_hadK``/``matmul_hadU``. 
+ * ``"random_hadamard"`` → ``(None, False, "random_hadamard")`` — all rotations use + ``random_hadamard_matrix``. + * A ``torch.Tensor`` of shape ``(n, n)`` → ``({n: tensor}, False, None)``. + * A ``dict[int, Tensor]`` → ``(dict, False, None)`` — returned as-is. + + Returns: + ``(had_dict, use_fast_had, preset)`` + + Raises: + ValueError: if a non-``str`` *rotation_matrix* is given but + *group_size* is not a positive integer, or an unknown preset. + """ + if rotation_matrix is None: + return None, False, None + + if isinstance(rotation_matrix, str): + if rotation_matrix not in BUILTIN_ROTATION_PRESETS: + raise ValueError( + f"Unknown rotation_matrix preset '{rotation_matrix}'. " + f"Supported presets: {BUILTIN_ROTATION_PRESETS}." + ) + if rotation_matrix == "quarot_hadamard": + return None, True, "quarot_hadamard" + elif rotation_matrix == "hadamard": + return None, False, "hadamard" + else: # "random_hadamard" + return None, False, "random_hadamard" + + is_grouped = group_size is not None and group_size > 0 + if not is_grouped and not isinstance(rotation_matrix, dict): + raise ValueError( + "rotation_matrix (Tensor/dict) can only be used with a positive group_size. " + f"Got group_size={group_size}." + ) + + if isinstance(rotation_matrix, torch.Tensor): + assert ( + rotation_matrix.ndim == 2 and rotation_matrix.shape[0] == rotation_matrix.shape[1] + ), f"rotation_matrix must be square, got shape {rotation_matrix.shape}" + return {rotation_matrix.shape[0]: rotation_matrix}, False, None + + if isinstance(rotation_matrix, dict): + for k, t in rotation_matrix.items(): + assert ( + isinstance(t, torch.Tensor) and t.ndim == 2 and t.shape[0] == t.shape[1] + ), f"rotation_matrix[{k}] must be a square tensor, got shape {t.shape}" + return rotation_matrix, False, None + + raise TypeError( + f"rotation_matrix must be a Tensor, dict[int, Tensor], str, or None. " f"Got {type(rotation_matrix)}." + ) + + +def _get_custom_had(had_dict, size): + """Look up a custom Hadamard matrix for *size* from the normalized dict. + + Returns ``(had_tensor, True)`` if found, ``(None, False)`` otherwise. 
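    A small usage sketch of the two helpers together (values are illustrative and
    assume the helpers are imported from this module)::

        import torch

        _normalize_rotation_matrix("hadamard", None)        # -> (None, False, "hadamard")
        _normalize_rotation_matrix("quarot_hadamard", 32)   # -> (None, True, "quarot_hadamard")

        R = torch.eye(64)                                   # any square rotation matrix
        had_dict, fast, preset = _normalize_rotation_matrix(R, 64)  # -> ({64: R}, False, None)
        _get_custom_had(had_dict, 64)                       # -> (R, True)
        _get_custom_had(had_dict, 128)                      # -> (None, False)

        # the "random_hadamard" preset draws from a per-dimension global cache,
        # so repeated lookups return the identical matrix object
        assert get_or_create_random_hadamard(64) is get_or_create_random_hadamard(64)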
+ """ + if had_dict is None: + return None, False + if size in had_dict: + return had_dict[size], True + return None, False + + +# --------------------------------------------------------------------------- +# Hook implementations +# --------------------------------------------------------------------------- + + +class FullOnlineHadamardHook(nn.Module): + """Pre-forward hook: full Hadamard on the entire last dimension (for ``down_proj``).""" + + def __init__(self, had_K, K, fp32_had=False, use_fast_had=True, had_matrix=None): + super().__init__() + self.custom_had = had_matrix is not None + if had_matrix is not None: + self.register_buffer("had_matrix", had_matrix) + self.had_K = None + self.K = None + else: + if had_K is not None: + self.register_buffer("had_K", had_K) + else: + self.had_K = None + self.K = K + self.fp32_had = fp32_had + self.use_fast_had = use_fast_had + + def __call__(self, module: nn.Module, args): + x = args[0] if isinstance(args, tuple) else args + x_dtype = x.dtype + + if self.custom_had: + H = self.had_matrix.to(device=x.device, dtype=x.dtype) + if self.fp32_had: + H = self.had_matrix.to(device=x.device).float() + x = (x.float() @ H.T).to(x_dtype) + else: + x = x @ H.T + elif self.fp32_had: + x = matmul_hadU_cuda(x.float(), self.had_K, self.K, use_fast_had=self.use_fast_had).to(x_dtype) + else: + x = matmul_hadU_cuda(x, self.had_K, self.K, use_fast_had=self.use_fast_had) + + if isinstance(args, tuple): + return (x,) + args[1:] + return x + + +class CrossHeadOnlineHadamardHook(nn.Module): + """Pre-forward hook: **cross-head** Hadamard on the ``num_heads`` dimension + (for ``o_proj``). + + After offline rotation: + - ``v_proj`` absorbed a per-head (within-head) Hadamard on ``head_dim``. + - ``o_proj`` absorbed a full Hadamard on ``hidden_size``. + + Since ``H_full = H_cross ⊗ H_within`` (Kronecker decomposition) and the + within-head part is already cancelled by ``v_proj`` through the attention + path (``H_within² = I``), the online hook only needs to apply the residual + **cross-head** Hadamard (``H_cross ⊗ I``): + + * reshape ``(*, hidden_size)`` → ``(*, num_heads, head_dim)`` + * transpose → ``(*, head_dim, num_heads)`` + * Hadamard on the **num_heads** axis (last dim) + * transpose back and reshape + """ + + def __init__(self, had_K, K, head_dim, fp32_had=False, use_fast_had=True, had_matrix=None): + """ + Args: + had_K: Hadamard sub-matrix from ``get_hadK(num_heads)``. + K: Block size from ``get_hadK(num_heads)``. + head_dim: ``hidden_size // num_attention_heads``. + fp32_had: Compute in fp32. + use_fast_had: If True use fast_hadamard_transform; if False use matmul_hadU. + had_matrix: Optional custom rotation matrix of shape ``(num_heads, num_heads)``. 
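        A minimal numeric check of the Kronecker claim in the class docstring above
        (sizes are illustrative; ``had`` is a local normalized Sylvester Hadamard,
        not this module's ``get_hadK`` path)::

            import math
            import torch

            def had(n):
                H = torch.ones(1, 1, dtype=torch.float64)
                while H.shape[0] < n:
                    H = torch.cat([torch.cat([H, H], 1), torch.cat([H, -H], 1)], 0)
                return H / math.sqrt(n)

            num_heads, head_dim = 4, 8
            H_full, H_cross, H_within = had(num_heads * head_dim), had(num_heads), had(head_dim)
            # power-of-two Hadamards factor as H_full == H_cross ⊗ H_within
            assert torch.allclose(H_full, torch.kron(H_cross, H_within))
            # reshape → transpose → Hadamard on the heads axis implements (H_cross ⊗ I)
            x = torch.randn(num_heads * head_dim, dtype=torch.float64)
            lhs = torch.kron(H_cross, torch.eye(head_dim, dtype=torch.float64)) @ x
            rhs = (x.reshape(num_heads, head_dim).T @ H_cross.T).T.reshape(-1)
            assert torch.allclose(lhs, rhs)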
+ """ + super().__init__() + self.custom_had = had_matrix is not None + if had_matrix is not None: + self.register_buffer("had_matrix", had_matrix) + self.had_K = None + self.K = None + else: + if had_K is not None: + self.register_buffer("had_K", had_K) + else: + self.had_K = None + self.K = K + self.had_dim = head_dim + self.fp32_had = fp32_had + self.use_fast_had = use_fast_had + + def __call__(self, module: nn.Module, args): + x = args[0] if isinstance(args, tuple) else args + x_dtype = x.dtype + + if self.fp32_had: + x = x.float() + + init_shape = x.shape + num_heads = init_shape[-1] // self.had_dim + + if self.custom_had: + H = self.had_matrix.to(device=x.device, dtype=x.dtype) + # reshape (*, hidden) → (*, num_heads, head_dim), transpose → (*, head_dim, num_heads) + x = x.reshape(-1, num_heads, self.had_dim).transpose(1, 2) + # apply H on last dim (num_heads): x @ H.T + x = (x @ H.T).transpose(1, 2) + elif self.use_fast_had and fast_hadamard_transform is not None and self.K == 1: + x = fast_hadamard_transform.hadamard_transform( + x.reshape(-1, num_heads, self.had_dim).transpose(1, 2), + scale=1 / math.sqrt(num_heads), + ).transpose(1, 2) + else: + # Fallback: use matmul_hadU (pure butterfly + had_K, no fast_hadamard_transform) + x = x.reshape(-1, num_heads, self.had_dim).transpose(1, 2) + x = matmul_hadU(x.contiguous()) + x = x.transpose(1, 2) + + if self.fp32_had: + x = x.to(x_dtype) + x = x.reshape(init_shape) + + if isinstance(args, tuple): + return (x,) + args[1:] + return x + + +# --------------------------------------------------------------------------- +# Registration helper +# --------------------------------------------------------------------------- + + +def register_online_had_hooks(model, mapping=None, fp32_had=False, use_fast_had=True): + """Register online Hadamard pre-forward hooks on ``down_proj`` and ``o_proj``. + + * **down_proj** (``online_full_had``): full Hadamard on ``intermediate_size``. + Compensates ``apply_exact_had_to_linear(down_proj, had_dim=-1, output=False)``. + + * **o_proj** (``online cross-head had``): cross-head Hadamard on ``num_heads``. + Compensates the residual after v_proj's within-head Hadamard cancels. + + Args: + model: A HuggingFace model whose weights have already been rotated. + mapping: A :class:`RotationMapping` (auto-inferred if ``None``). + fp32_had: Whether to compute the Hadamard transform in fp32. + use_fast_had: If True use fast_hadamard_transform; if False use matmul_hadU. + + Returns: + list of hook handles (call ``handle.remove()`` to detach). + """ + if mapping is None: + from auto_round.algorithms.transforms.rotation.inplace.model_config import infer_mapping_from_model + + mapping = infer_mapping_from_model(model) + + config = model.config + num_heads = getattr(config, mapping.num_heads_attr) + hidden_size = getattr(config, mapping.hidden_size_attr) + intermediate_size = getattr(config, mapping.intermediate_size_attr) + head_dim = mapping.attn_head_dim or (hidden_size // num_heads) + + # down_proj: full Hadamard on intermediate_size + had_K_full, K_full = get_hadK(intermediate_size) + + # o_proj: cross-head Hadamard on num_heads + had_K_head, K_head = get_hadK(num_heads) + + # Identify target module suffixes from mapping + mlp_out_suffix = mapping.mlp_out.split(".")[-1] # e.g. "down_proj" + attn_o_suffix = mapping.attn_o.split(".")[-1] # e.g. 
"o_proj" + + handles = [] + for name, module in model.named_modules(): + if name.endswith(mlp_out_suffix) and isinstance(module, nn.Linear): + hook = FullOnlineHadamardHook( + had_K=had_K_full, + K=K_full, + fp32_had=fp32_had, + use_fast_had=use_fast_had, + ) + h = module.register_forward_pre_hook(hook) + handles.append(h) + elif name.endswith(attn_o_suffix) and isinstance(module, nn.Linear): + hook = CrossHeadOnlineHadamardHook( + had_K=had_K_head, + K=K_head, + head_dim=head_dim, + fp32_had=fp32_had, + use_fast_had=use_fast_had, + ) + h = module.register_forward_pre_hook(hook) + handles.append(h) + + return handles + + +def is_pow2(n): + return (n & (n - 1) == 0) and (n > 0) + + +# Adapted from https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py +def get_hadK(n: int, transpose=False) -> (torch.Tensor, int): + hadK, K = None, None + + if is_pow2(n): + K = 1 + return hadK, K + else: + from auto_round.algorithms.transforms.rotation.utils.math import _fetch_hadamard_divisor + + hadK = _fetch_hadamard_divisor(n, torch.float, torch.device("cpu")) + if transpose: + hadK = hadK.T + if hadK is not None: + return hadK, 1 if is_pow2(hadK.shape[0]) else hadK.shape[0] + assert is_pow2(n) + + +def matmul_hadU(X, transpose=False): + n = X.shape[-1] + hadK, K = get_hadK(n, transpose) + input = X.clone().view(-1, n, 1) + output = input.clone() + while input.shape[1] > K: + input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2]) + output = output.view(input.shape) + output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :] + output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :] + output = output.view(input.shape[0], input.shape[1], -1) + input, output = (output, input) + del output + + if K > 1: + # Do not explicitly repeat - OOM + # input = torch.bmm( + # hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input) + # Use bcast instead + input = hadK.view(1, K, K).to(input) @ input + + return input.view(X.shape) / torch.tensor(n).sqrt() + + +def matmul_hadUt(X): + return matmul_hadU(X, transpose=True) + + +def random_hadamard_matrix(size, device): + # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation" + Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64) + Q = Q * 2 - 1 + Q = torch.diag(Q) + return matmul_hadU(Q).to(device) + + +def deterministic_hadamard_matrix(size, device): + """Build a deterministic Hadamard matrix of the given *size*. + + Applies the butterfly ``matmul_hadU`` to an identity matrix so that the + result is purely determined by ``get_hadK`` (no random sign flips). 
+ """ + Q = torch.eye(size, dtype=torch.float64) + return matmul_hadU(Q).to(device) + + +def matmul_hadU_cuda(X, hadK, K, use_fast_had=True): + n = X.shape[-1] + if not use_fast_had or fast_hadamard_transform is None: + return matmul_hadU(X) + if K == 1: + return fast_hadamard_transform.hadamard_transform(X.contiguous(), 1.0 / torch.tensor(n).sqrt()) + # if transpose: + # hadK = hadK.T.contiguous() + input = X.view(*X.shape[:-1], K, n // K) + input = fast_hadamard_transform.hadamard_transform(input.contiguous(), 1.0 / torch.tensor(n).sqrt()) + input = hadK.to(input.device).to(input.dtype) @ input + return input.reshape(X.shape) + + +def matmul_hadUt_cuda(X, hadK, K, use_fast_had=True): + return matmul_hadU_cuda(X, hadK, K, use_fast_had=use_fast_had) + + +def apply_exact_had_to_linear( + module, had_dim=-1, output=False, use_fast_had=True, compute_device=None, had_matrix=None +): + """Apply Hadamard rotation to a Linear layer's weight in-place. + + Args: + module: ``nn.Linear`` layer. + had_dim: Dimension of each Hadamard block (``-1`` for full dimension). + output: If ``True`` rotate the output (row) side; otherwise input (col). + use_fast_had: Use ``fast_hadamard_transform`` when available. + compute_device: Device to run computation on. + had_matrix: Optional custom rotation matrix. When ``had_dim == -1`` + this should be a square tensor whose size equals + ``out_features`` (output) or ``in_features`` (input). When + ``had_dim > 0`` the size should equal ``had_dim``. + """ + assert isinstance(module, torch.nn.Linear) + in_features, out_features = module.in_features, module.out_features + + if had_dim != -1 and had_matrix is None: + assert is_pow2(had_dim), "Hadamard dimension must be a power of 2!" + + W_ = module.weight.data + dtype = W_.dtype + dev = W_.device + init_shape = W_.shape + compute_dev = _resolve_compute_device(compute_device) + W_ = W_.double().to(compute_dev) + + if had_matrix is not None: + H = had_matrix.to(device=compute_dev, dtype=torch.float64) + if had_dim == -1: + # Full-dimension custom matrix + if output: + # W.T = H @ W.T → W = (H @ W.T).T = W @ H.T + W_ = W_ @ H.T + else: + # W = H @ W (rotate input columns: W_new[i,:] = sum H[i,j]*W[j,:]) + # Actually for input side: W_new = W @ H (each row is rotated) + W_ = W_ @ H.T + else: + # Per-block custom matrix + if output: + W_ = W_.t() + transposed_shape = W_.shape + flat = W_.reshape(-1, had_dim) + W_ = (flat @ H.T).reshape(transposed_shape).t() + else: + flat = W_.reshape(-1, had_dim) + W_ = (flat @ H.T).reshape(init_shape) + elif had_dim == -1: + if output: + had_K, K = get_hadK(out_features) + W_ = matmul_hadU_cuda(W_.t(), had_K, K, use_fast_had=use_fast_had).t() + if not output: + had_K, K = get_hadK(in_features) + W_ = matmul_hadU_cuda(W_, had_K, K, use_fast_had=use_fast_had) + else: + # Apply Hadamard to the last had_dim chunks of the weights + if output: + W_ = W_.t() + transposed_shape = W_.shape + if use_fast_had and fast_hadamard_transform is not None: + W_ = ( + fast_hadamard_transform.hadamard_transform( + W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), scale=1 / math.sqrt(had_dim) + ) + .reshape(transposed_shape) + .t() + ) + else: + W_ = matmul_hadU(W_.reshape(-1, had_dim)).reshape(transposed_shape).t() + else: + if use_fast_had and fast_hadamard_transform is not None: + n = W_.shape[1] + W_ = fast_hadamard_transform.hadamard_transform( + W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) + ).reshape(init_shape) + else: + W_ = matmul_hadU(W_.reshape(-1, 
had_dim)).reshape(init_shape) + module.weight.data = W_.to(device=dev, dtype=dtype) + + +def apply_cross_head_had_to_linear( + module, num_heads, head_dim, use_fast_had=True, compute_device=None, had_matrix=None +): + """Apply a cross-head Hadamard rotation to a Linear layer's input side. + + The operation is equivalent to ``(H_cross ⊗ I_head_dim)`` applied to the + input columns: + + * Reshape columns ``(hidden_size,)`` → ``(num_heads, head_dim)`` + * Transpose → ``(head_dim, num_heads)`` + * Hadamard on the ``num_heads`` axis + * Transpose back and reshape + + This mirrors what :class:`CrossHeadOnlineHadamardHook` does at runtime. + + Args: + module: ``nn.Linear`` layer whose ``in_features == num_heads * head_dim``. + num_heads: Number of attention heads. + head_dim: Per-head dimension. + use_fast_had: Use ``fast_hadamard_transform`` when available. + compute_device: Device to run computation on. + had_matrix: Optional custom rotation matrix of shape ``(num_heads, num_heads)``. + """ + assert isinstance(module, torch.nn.Linear) + W_ = module.weight.data + dtype = W_.dtype + dev = W_.device + compute_dev = _resolve_compute_device(compute_device) + W_ = W_.double().to(compute_dev) + + out_f = W_.shape[0] + # W shape: (out_features, hidden_size) where hidden_size = num_heads * head_dim + # Reshape columns: (out_f, num_heads, head_dim) + W_ = W_.reshape(out_f, num_heads, head_dim) + # Transpose last two dims: (out_f, head_dim, num_heads) + W_ = W_.transpose(1, 2).contiguous() + + if had_matrix is not None: + H = had_matrix.to(device=compute_dev, dtype=torch.float64) + # Apply H on last dim (num_heads): flat @ H.T + flat = W_.reshape(-1, num_heads) + W_ = (flat @ H.T).reshape(out_f, head_dim, num_heads) + elif use_fast_had and fast_hadamard_transform is not None and is_pow2(num_heads): + W_ = fast_hadamard_transform.hadamard_transform(W_, scale=1.0 / math.sqrt(num_heads)) + else: + W_ = matmul_hadU(W_.reshape(-1, num_heads)).reshape(out_f, head_dim, num_heads) + + # Transpose back: (out_f, num_heads, head_dim) → (out_f, hidden_size) + W_ = W_.transpose(1, 2).contiguous().reshape(out_f, num_heads * head_dim) + module.weight.data = W_.to(device=dev, dtype=dtype) + + +# --------------------------------------------------------------------------- +# Grouped (block-diagonal) Hadamard utilities +# --------------------------------------------------------------------------- + + +class OnlineHadamardPostHook(nn.Module): + """Forward hook (post-hook) adapter: wraps a pre-hook-style Hadamard + transform to apply it on the layer's **output** instead of input. + + Used for v_proj per-head Hadamard on the output side when online + rotation is not fused into weights. + """ + + def __init__(self, pre_hook): + super().__init__() + self.pre_hook = pre_hook + + def __call__(self, module, input, output): + result = self.pre_hook(module, (output,)) + if isinstance(result, tuple): + return result[0] + return result + + +class GroupOnlineHadamardHook(nn.Module): + """Pre-forward hook: block-diagonal Hadamard with fixed ``group_size`` on last dim. + + Reshapes ``(*, D)`` → ``(*, D // group_size, group_size)``, applies Hadamard + per group, then reshapes back. Much cheaper than a full-dimension Hadamard. 
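    A minimal numeric sketch of the per-group transform (sizes are illustrative;
    ``had`` is a local normalized Sylvester Hadamard, not this module's helpers)::

        import math
        import torch

        def had(n):
            H = torch.ones(1, 1, dtype=torch.float64)
            while H.shape[0] < n:
                H = torch.cat([torch.cat([H, H], 1), torch.cat([H, -H], 1)], 0)
            return H / math.sqrt(n)

        D, gs = 16, 4
        x = torch.randn(3, D, dtype=torch.float64)
        H = had(gs)
        # reshape (*, D) -> (*, D // gs, gs), rotate the last dim per group, reshape back ...
        grouped = (x.reshape(3, D // gs, gs) @ H.T).reshape(3, D)
        # ... which equals one block-diagonal rotation kron(I, H) on the full dimension
        block = torch.kron(torch.eye(D // gs, dtype=torch.float64), H)
        assert torch.allclose(grouped, x @ block.T)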
+ """ + + def __init__(self, group_size, fp32_had=False, use_fast_had=True, had_matrix=None): + super().__init__() + self.group_size = group_size + self.fp32_had = fp32_had + self.use_fast_had = use_fast_had + self.custom_had = had_matrix is not None + + if had_matrix is not None: + self.register_buffer("had_matrix", had_matrix) + self.had_K = None + self.K = None + elif not is_pow2(group_size): + had_K, K = get_hadK(group_size) + if had_K is not None: + self.register_buffer("had_K", had_K) + else: + self.had_K = None + self.K = K + else: + self.had_K = None + self.K = 1 + + def __call__(self, module: nn.Module, args): + x = args[0] if isinstance(args, tuple) else args + x_dtype = x.dtype + init_shape = x.shape + gs = self.group_size + + if self.fp32_had: + x = x.float() + + # Reshape: (*, D) → (*, D//gs, gs) + x = x.reshape(*init_shape[:-1], init_shape[-1] // gs, gs) + + if self.custom_had: + H = self.had_matrix.to(device=x.device, dtype=x.dtype) + flat = x.reshape(-1, gs) + x = (flat @ H.T).reshape(*init_shape[:-1], init_shape[-1] // gs, gs) + elif self.use_fast_had and fast_hadamard_transform is not None and self.K == 1: + x = fast_hadamard_transform.hadamard_transform(x, scale=1.0 / math.sqrt(gs)) + else: + x = x.reshape(-1, gs) + x = matmul_hadU(x) + x = x.reshape(*init_shape[:-1], init_shape[-1] // gs, gs) + + x = x.reshape(init_shape) + + if self.fp32_had: + x = x.to(x_dtype) + + if isinstance(args, tuple): + return (x,) + args[1:] + return x + + +def _apply_grouped_had_to_weight(W, group_size, side="input", use_fast_had=True, had_matrix=None): + """Apply block-diagonal Hadamard to a weight matrix. + + Args: + W: Weight tensor, shape (out_features, in_features). + group_size: Block size for the Hadamard rotation. + side: ``'input'`` rotates columns (in_features dim), + ``'output'`` rotates rows (out_features dim). + use_fast_had: Use fast_hadamard_transform if available. + had_matrix: Optional custom Hadamard matrix of shape ``(gs, gs)`` + to use instead of the built-in Hadamard. + + Returns: + Rotated weight tensor. + """ + gs = group_size + dtype = W.dtype + W = W.double() + + def _had_on_last_dim(X): + """Apply Hadamard on the last dimension (size gs) of X shaped (..., gs).""" + if had_matrix is not None: + H = had_matrix.to(device=X.device, dtype=X.dtype) + # X: (..., gs) → batch matmul with H^T → X @ H^T + flat = X.reshape(-1, gs) + return (flat @ H.T).reshape(X.shape) + if use_fast_had and fast_hadamard_transform is not None and is_pow2(gs): + return fast_hadamard_transform.hadamard_transform(X, scale=1.0 / math.sqrt(gs)) + orig_shape = X.shape + return matmul_hadU(X.reshape(-1, gs)).reshape(orig_shape) + + if side == "input": + out_f, in_f = W.shape + W = W.reshape(out_f, in_f // gs, gs) + W = _had_on_last_dim(W) + W = W.reshape(out_f, in_f) + else: + out_f, in_f = W.shape + Wt = W.t().contiguous() + Wt = Wt.reshape(in_f, out_f // gs, gs) + Wt = _had_on_last_dim(Wt) + W = Wt.reshape(in_f, out_f).t().contiguous() + + return W.to(dtype) + + +def _rotate_linear_grouped(module, group_size, side="input", use_fast_had=True, compute_device=None, had_matrix=None): + """Apply block-diagonal Hadamard rotation to a Linear layer's weight. + + Args: + module: ``nn.Linear`` layer. + group_size: Block size. + side: ``'input'`` or ``'output'``. + use_fast_had: Use fast_hadamard_transform. + compute_device: Device to run computation on. If None, auto-detects GPU. + had_matrix: Optional custom Hadamard matrix of shape ``(gs, gs)``. 
+ """ + dtype = module.weight.data.dtype + dev = module.weight.data.device + compute_dev = _resolve_compute_device(compute_device) + W = module.weight.data.to(device=compute_dev, dtype=torch.float64) + W = _apply_grouped_had_to_weight(W, group_size, side=side, use_fast_had=use_fast_had, had_matrix=had_matrix) + module.weight.data = W.to(device=dev, dtype=dtype) + + if side == "output" and module.bias is not None: + bias = module.bias.data.to(device=compute_dev, dtype=torch.float64) + gs = group_size + bias = bias.reshape(-1, gs) + if had_matrix is not None: + H = had_matrix.to(device=compute_dev, dtype=torch.float64) + bias = (bias @ H.T).reshape(-1) + elif use_fast_had and fast_hadamard_transform is not None and is_pow2(gs): + bias = ( + fast_hadamard_transform.hadamard_transform(bias.unsqueeze(0), scale=1.0 / math.sqrt(gs)) + .squeeze(0) + .reshape(-1) + ) + else: + bias = matmul_hadU(bias).reshape(-1) + module.bias.data = bias.to(device=dev, dtype=dtype) + + +def _rotate_embedding_grouped(embedding, group_size, use_fast_had=True, compute_device=None, had_matrix=None): + """Apply block-diagonal Hadamard rotation to an Embedding layer. + + Embedding weight: (vocab, hidden_size) → rotate on hidden_size (columns). + """ + dtype = embedding.weight.data.dtype + dev = embedding.weight.data.device + compute_dev = _resolve_compute_device(compute_device) + W = embedding.weight.data.to(device=compute_dev, dtype=torch.float64) + W = _apply_grouped_had_to_weight(W, group_size, side="input", use_fast_had=use_fast_had, had_matrix=had_matrix) + new_W = W.to(device=dev, dtype=dtype) + del W + embedding.weight.data = new_W + + +def register_online_had_hooks_grouped(model, mapping, group_size, fp32_had=False, use_fast_had=True): + """Register per-group online Hadamard hooks on ``down_proj`` and ``o_proj``. + + In grouped mode: + - **down_proj**: block-diagonal Hadamard on ``intermediate_size`` with ``group_size``. + - **o_proj**: block-diagonal Hadamard on ``hidden_size`` with ``group_size``. + + Args: + model: HuggingFace model with rotated weights. + mapping: RotationMapping. + group_size: Block size for block-diagonal Hadamard. + fp32_had: Compute in fp32. + use_fast_had: Use fast_hadamard_transform. + + Returns: + list of hook handles. + """ + mlp_out_suffix = mapping.mlp_out.split(".")[-1] + attn_o_suffix = mapping.attn_o.split(".")[-1] + + handles = [] + for name, module in model.named_modules(): + if name.endswith(mlp_out_suffix) and isinstance(module, nn.Linear): + hook = GroupOnlineHadamardHook( + group_size=group_size, + fp32_had=fp32_had, + use_fast_had=use_fast_had, + ) + h = module.register_forward_pre_hook(hook) + handles.append(h) + elif name.endswith(attn_o_suffix) and isinstance(module, nn.Linear): + hook = GroupOnlineHadamardHook( + group_size=group_size, + fp32_had=fp32_had, + use_fast_had=use_fast_had, + ) + h = module.register_forward_pre_hook(hook) + handles.append(h) + + return handles diff --git a/auto_round/algorithms/transforms/rotation/inplace/model_config.py b/auto_round/algorithms/transforms/rotation/inplace/model_config.py new file mode 100644 index 000000000..3ecbf9b69 --- /dev/null +++ b/auto_round/algorithms/transforms/rotation/inplace/model_config.py @@ -0,0 +1,169 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 + +"""Model architecture mapping for Hadamard rotation. + +Each :class:`RotationMapping` describes *where* the rotation-relevant modules +live inside a model. Currently supports LLaMA-2, LLaMA-3, and Qwen-3 (dense). 
+ +New architectures can be supported by calling :func:`register_mapping`. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +from auto_round.utils import logger + +__all__ = [ + "RotationMapping", + "register_mapping", + "get_mapping", + "infer_mapping_from_model", + "MAPPING_REGISTRY", +] + + +# --------------------------------------------------------------------------- +# Mapping dataclass +# --------------------------------------------------------------------------- + + +@dataclass +class RotationMapping: + """Declarative description of a transformer architecture for Hadamard rotation. + + Attribute names follow the dot-path convention relative to the model or + each decoder layer. + + Config attribute names (read from ``model.config``): + num_heads_attr, hidden_size_attr, intermediate_size_attr + head_dim_override – explicit head dim (skip hidden_size // num_heads) + """ + + # -- top-level modules (dot-path from model root) -- + embedding: str = "model.embed_tokens" + lm_head: str = "lm_head" + positional_embedding: Optional[str] = None # e.g. "model.decoder.embed_positions" for OPT + + # -- layers container (dot-path from model root) -- + layers_attr: str = "model.layers" + + # -- per-layer: attention (dot-path from each layer) -- + attn_input_ln: str = "input_layernorm" + attn_q: str = "self_attn.q_proj" + attn_k: str = "self_attn.k_proj" + attn_v: str = "self_attn.v_proj" + attn_o: str = "self_attn.o_proj" + + # -- per-layer: MLP (dot-path from each layer) -- + mlp_input_ln: str = "post_attention_layernorm" + mlp_in: List[str] = field(default_factory=lambda: ["mlp.up_proj", "mlp.gate_proj"]) + mlp_out: str = "mlp.down_proj" + + # -- final norm (dot-path from model root) -- + pre_head_ln: str = "model.norm" + + # -- head dim override (None = hidden_size // num_heads) -- + attn_head_dim: Optional[int] = None + + # -- config attr names -- + num_heads_attr: str = "num_attention_heads" + hidden_size_attr: str = "hidden_size" + intermediate_size_attr: str = "intermediate_size" + + +# --------------------------------------------------------------------------- +# Helper: resolve a dot-path attribute on a module +# --------------------------------------------------------------------------- + + +def _resolve(root, dot_path: str): + """Resolve ``'a.b.c'`` to ``root.a.b.c``.""" + obj = root + for attr in dot_path.split("."): + obj = getattr(obj, attr) + return obj + + +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + +MAPPING_REGISTRY: Dict[str, RotationMapping] = {} + + +def register_mapping(key: str, mapping: RotationMapping) -> RotationMapping: + """Register a :class:`RotationMapping` under *key* (model_type or architecture).""" + MAPPING_REGISTRY[key] = mapping + return mapping + + +def get_mapping(key: str) -> RotationMapping: + """Look up a mapping by *key*; fall back to default if not found.""" + if key in MAPPING_REGISTRY: + return MAPPING_REGISTRY[key] + logger.warning(f"No rotation mapping registered for '{key}', " "falling back to default (LLaMA-like) mapping.") + return RotationMapping() + + +def infer_mapping_from_model(model) -> RotationMapping: + """Return the best :class:`RotationMapping` for *model*. + + Tries ``model.config.model_type`` first, then ``model.__class__.__name__``. 
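    For an architecture that is not registered, a mapping can be added up front;
    the module paths below are hypothetical and only illustrate the call pattern::

        my_mapping = RotationMapping(
            mlp_in=["mlp.fc_in"],     # hypothetical gate/up projection path(s)
            mlp_out="mlp.fc_out",     # hypothetical down projection path
        )
        register_mapping("my_model_type", my_mapping)        # keyed by config.model_type
        register_mapping("MyModelForCausalLM", my_mapping)   # and/or by the class name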
+ """ + model_type = getattr(getattr(model, "config", None), "model_type", "") + if model_type in MAPPING_REGISTRY: + return MAPPING_REGISTRY[model_type] + + arch = model.__class__.__name__ + if arch in MAPPING_REGISTRY: + return MAPPING_REGISTRY[arch] + + logger.warning( + f"Unrecognised architecture '{arch}' (model_type='{model_type}'). " + "Falling back to default (LLaMA-like) mapping." + ) + return RotationMapping() + + +# =================================================================== +# Built-in mappings +# =================================================================== + +# LLaMA-2 / LLaMA-3 / Mistral / Yi — all share the same layout +_default = RotationMapping() + +register_mapping("llama", _default) +register_mapping("LlamaForCausalLM", _default) + +# Qwen-3 dense — identical layout to LLaMA +register_mapping("qwen3", _default) +register_mapping("Qwen3ForCausalLM", _default) + +# Qwen-2 / Qwen-2.5 dense — identical layout to LLaMA +register_mapping("qwen2", _default) +register_mapping("Qwen2ForCausalLM", _default) + +# ---- OPT ---- +# OPT uses standard LayerNorm (with bias, subtracts mean), +# different module names, and tied lm_head ↔ embedding weights. +_opt = RotationMapping( + embedding="model.decoder.embed_tokens", + lm_head="lm_head", + positional_embedding="model.decoder.embed_positions", + layers_attr="model.decoder.layers", + attn_input_ln="self_attn_layer_norm", + attn_q="self_attn.q_proj", + attn_k="self_attn.k_proj", + attn_v="self_attn.v_proj", + attn_o="self_attn.out_proj", + mlp_input_ln="final_layer_norm", + mlp_in=["fc1"], + mlp_out="fc2", + pre_head_ln="model.decoder.final_layer_norm", + intermediate_size_attr="ffn_dim", +) +register_mapping("opt", _opt) +register_mapping("OPTForCausalLM", _opt) diff --git a/auto_round/experimental/apply_rotation_transform.py b/auto_round/experimental/apply_rotation_transform.py index fc7dbf297..520f6b2fb 100644 --- a/auto_round/experimental/apply_rotation_transform.py +++ b/auto_round/experimental/apply_rotation_transform.py @@ -1,154 +1,12 @@ -# Copyright (c) 2026 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -"""Unified entry point for Hadamard rotation/transform. - -Two backend implementations exist: - -* ``inplace`` – :mod:`auto_round.experimental.rotation_inplace` - QuaRot-style residual-stream rotation. Works for any weight/activation - dtype. Optionally fuses the online Hadamard into weights - (``fuse_online_to_weight=True``). -* ``transform`` – :mod:`auto_round.experimental.transform` - Per-Linear weight + activation Hadamard with a fused triton kernel. - Only supports MXFP4 / NVFP4 and **cannot** fuse online to weight. - -Routing is controlled by :class:`RotationConfig.backend`: - - "inplace" -> always inplace - "transform" -> always transform (validates dtype + no-fuse) - "auto" -> if user asked to fuse -> inplace - elif data_type is mx_fp / nv_fp -> transform - else -> inplace +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.dispatcher`. 
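Existing call sites keep working; both import paths resolve to the same objects,
as a minimal illustration of the re-export::

    from auto_round.experimental.apply_rotation_transform import apply_hadamard_rotation
    from auto_round.algorithms.transforms.rotation.dispatcher import (
        apply_hadamard_rotation as canonical_apply_hadamard_rotation,
    )
    assert apply_hadamard_rotation is canonical_apply_hadamard_rotation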
""" -from __future__ import annotations - -from typing import Any, Union - -import torch - -import auto_round.envs as envs -from auto_round.compressors.utils import is_mx_fp, is_nv_fp -from auto_round.experimental.transform.rotation_config import RotationConfig -from auto_round.experimental.utils import normalize_rotation_config -from auto_round.utils import logger - -__all__ = ["apply_hadamard_rotation", "resolve_hadamard_backend"] - - -def _to_config( - rotation_config: Union[str, dict, RotationConfig, None], - data_type: str, -) -> RotationConfig: - """Normalise *rotation_config* and return a :class:`RotationConfig` instance.""" - cfg_dict = normalize_rotation_config(rotation_config, data_type) - if isinstance(cfg_dict, RotationConfig): - return cfg_dict - return RotationConfig.model_validate(cfg_dict or {}) - - -def resolve_hadamard_backend(config: RotationConfig, data_type: str) -> str: - """Resolve the actual backend (``"inplace"`` / ``"transform"``) from config.""" - requested = config.backend - fuse_requested = bool(config.fuse_online_to_weight) - allow_online_rotation: bool = config.allow_online_rotation - - if requested == "inplace": - return "inplace" - - transform_backend_name = "transform" - if requested == "transform": - if fuse_requested: - raise ValueError( - f"backend='{transform_backend_name}' does not support fuse_online_to_weight=True. " - "Use backend='inplace' (or backend='auto' with fuse_online_to_weight=True) instead." - ) - if not (is_mx_fp(data_type) or is_nv_fp(data_type)): - raise ValueError( - f"backend='{transform_backend_name}' only supports MXFP4 / NVFP4 (got data_type={data_type!r}). " - "Use backend='inplace' or backend='auto' for other dtypes." - ) - if not allow_online_rotation: - raise ValueError(f"backend='{transform_backend_name}' only supports `allow_online_rotation`=True") - - return "transform" - - # backend == "auto" - if fuse_requested: - return "inplace" - if is_mx_fp(data_type) or is_nv_fp(data_type): - return "transform" - return "inplace" - - -def apply_hadamard_rotation( - model: torch.nn.Module, - rotation_config: Union[str, dict, RotationConfig, None], - data_type: str, - compute_device: torch.device | str = None, -) -> (torch.nn.Module, Any): - """Apply Hadamard rotation/transform to *model*, dispatching by backend. - - Args: - model: Target model. - rotation_config: ``str`` / ``dict`` / :class:`RotationConfig` / ``None``. - See :class:`RotationConfig` for fields. - data_type: Quantization data type (e.g. ``"mx_fp"``, ``"nv_fp"``, - ``"int"``, ``"fp"``). - compute_device: Device for inplace-backend computation. Ignored by - the transform backend. - - Returns: - The same model (for chaining); also stored on ``model.rotation_config``. - """ - config = _to_config(rotation_config, data_type) - backend = resolve_hadamard_backend(config, data_type) - - # Resolve fuse flag: explicit > env var > default(True) - fuse_online_to_weight = config.fuse_online_to_weight - if config.fuse_online_to_weight is not None: - fuse_online_to_weight = bool(config.fuse_online_to_weight) - elif envs.AR_FUSE_ONLINE_ROTATION: - fuse_online_to_weight = bool(envs.AR_FUSE_ONLINE_ROTATION) - - logger.info( - f"Applying Hadamard (backend={backend}, " - f"data_type={data_type}, fuse_online_to_weight={fuse_online_to_weight if backend == 'inplace' else False})." 
- ) - - if backend == "inplace": - logger.warning("this backend does not support real exporting, please export the model to fake format") - from auto_round.experimental.rotation_inplace import apply_rotation_transform - - # block_size -> group_size (None / -1 / 0 means full-dimension) - bs = config.block_size - group_size = bs if (bs is not None and bs > 0) else None - - model, hooks = apply_rotation_transform( - model, - group_size=group_size, - allow_online_rotation=config.allow_online_rotation, - rotation_matrix=config.hadamard_type, - fuse_online_to_weight=fuse_online_to_weight, - compute_device=compute_device, - ) - # Stash for downstream (export / serialization). Plain dict so JSON - # serialization (HF save_pretrained -> config.json) round-trips. - setattr(model, "rotation_config", config.model_dump() if hasattr(config, "model_dump") else config) - return model, hooks - - elif backend == "transform": - supported_hadamard_types = ("hadamard", "random_hadamard") - if config.hadamard_type not in supported_hadamard_types: - raise ValueError("this backend only supports hadamard or random_hadamard") - from auto_round.experimental.transform.apply import apply_rotation_transform - - return apply_rotation_transform(model, config, data_type=data_type) - else: - raise ValueError(f"Unsupported Hadamard backend {backend!r}") +from auto_round.algorithms.transforms.rotation.dispatcher import ( # noqa: F401 + apply_hadamard_rotation, + resolve_hadamard_backend, +) diff --git a/auto_round/experimental/rotation_inplace/__init__.py b/auto_round/experimental/rotation_inplace/__init__.py index 8cdef31b0..07b3d40c8 100644 --- a/auto_round/experimental/rotation_inplace/__init__.py +++ b/auto_round/experimental/rotation_inplace/__init__.py @@ -1,5 +1,14 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -from auto_round.experimental.rotation_inplace.apply_rotation_transform import apply_rotation_transform -from auto_round.experimental.rotation_inplace.utils import clear_random_hadamard_cache +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.inplace`. +""" + +from auto_round.algorithms.transforms.rotation.inplace.apply import ( # noqa: F401 + apply_rotation_transform, +) +from auto_round.algorithms.transforms.rotation.inplace.hooks import ( # noqa: F401 + clear_random_hadamard_cache, +) diff --git a/auto_round/experimental/rotation_inplace/apply_rotation_transform.py b/auto_round/experimental/rotation_inplace/apply_rotation_transform.py index 1052b036f..86e429c10 100644 --- a/auto_round/experimental/rotation_inplace/apply_rotation_transform.py +++ b/auto_round/experimental/rotation_inplace/apply_rotation_transform.py @@ -1,882 +1,12 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -"""Hadamard inplace rotation — public API and rotation primitives. - -Supports LLaMA-2, LLaMA-3, Qwen-3 (and any model with the same layout). -The entry point is :func:`apply_hadamard_rotation`. +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.inplace.apply`. 
""" -import gc -import typing -from typing import Dict, Union - -import torch -import tqdm - -from auto_round.experimental.rotation_inplace.model_config import ( - MAPPING_REGISTRY, - RotationMapping, - _resolve, - infer_mapping_from_model, -) -from auto_round.experimental.rotation_inplace.utils import ( - CrossHeadOnlineHadamardHook, - FullOnlineHadamardHook, - GroupOnlineHadamardHook, - _get_custom_had, - _normalize_rotation_matrix, - _resolve_compute_device, - _rotate_embedding_grouped, - _rotate_linear_grouped, - apply_cross_head_had_to_linear, - apply_exact_had_to_linear, - deterministic_hadamard_matrix, - get_hadK, - get_or_create_random_hadamard, +from auto_round.algorithms.transforms.rotation.inplace.apply import * # noqa: F401, F403 +from auto_round.algorithms.transforms.rotation.inplace.apply import ( # noqa: F401 + apply_rotation_transform, ) - -# --------------------------------------------------------------------------- -# Low-level primitives (model-agnostic via RotationMapping) -# --------------------------------------------------------------------------- - - -def _fuse_ln_linear( - layernorm: torch.nn.Module, - linear_layers: typing.Iterable[torch.nn.Linear], -) -> None: - """Fuse the linear operations in LayerNorm into adjacent linear blocks.""" - for linear in linear_layers: - linear_dtype = linear.weight.dtype - dev = linear.weight.device - - W_ = linear.weight.data.double() - ln_weight = layernorm.weight.double().to(dev) - linear.weight.data = (W_ * ln_weight).to(linear_dtype) - - if hasattr(layernorm, "bias") and layernorm.bias is not None: - if linear.bias is None: - linear.bias = torch.nn.Parameter(torch.zeros(linear.out_features, dtype=torch.float64, device=dev)) - ln_bias = layernorm.bias.double().to(dev) - linear.bias.data = linear.bias.data.double() + torch.matmul(W_, ln_bias) - linear.bias.data = linear.bias.data.to(linear_dtype) - - -def _reset_ln_params(layernorm: torch.nn.Module) -> None: - """Reset LayerNorm to identity: weight=1, bias=0.""" - layernorm.weight.data.fill_(1.0) - if hasattr(layernorm, "bias") and layernorm.bias is not None: - layernorm.bias.data.fill_(0.0) - - -def _rotate_linear_by_Q(module: torch.nn.Linear, Q: torch.Tensor, side: str, compute_device=None) -> None: - """Apply rotation *Q* to a Linear layer's weight (and bias if present). - - Args: - side: ``'input'`` → W = W @ Q (rotate input side) - ``'output'`` → W = Q^T @ W (rotate output side) - compute_device: Device to run computation on. If None, auto-detects GPU. - """ - dtype = module.weight.data.dtype - dev = module.weight.data.device - cdev = _resolve_compute_device(compute_device) - W_ = module.weight.data.to(device=cdev, dtype=torch.float64) - Q_ = Q.to(device=cdev) - if side == "input": - new_W = torch.matmul(W_, Q_).to(device=dev, dtype=dtype) - else: - new_W = torch.matmul(Q_.T, W_).to(device=dev, dtype=dtype) - # Release fp64 copy before assigning back so peak memory ≈ 1× weight + 1× rotated. 
- del W_ - module.weight.data = new_W - if side == "output" and module.bias is not None: - b = module.bias.data.to(device=cdev, dtype=torch.float64) - new_b = torch.matmul(Q_.T, b).to(device=dev, dtype=dtype) - del b - module.bias.data = new_b - del Q_ - - -def _untie_word_embeddings(model, mapping: RotationMapping) -> None: - """Break tied weights between lm_head and embedding if they share the same tensor.""" - embedding = _resolve(model, mapping.embedding) - lm_head = _resolve(model, mapping.lm_head) - - if lm_head.weight.data_ptr() != embedding.weight.data_ptr(): - return - - lm_head.weight = torch.nn.Parameter(lm_head.weight.data.clone()) - if hasattr(model.config, "tie_word_embeddings"): - model.config.tie_word_embeddings = False - - -def _uses_layernorm_with_mean(model, mapping: RotationMapping) -> bool: - """Check whether the model uses standard LayerNorm (which subtracts mean).""" - layers = _resolve(model, mapping.layers_attr) - first_ln = _resolve(layers[0], mapping.attn_input_ln) - return isinstance(first_ln, torch.nn.LayerNorm) - - -def _bake_mean_into_linear(linear: torch.nn.Linear) -> None: - """Subtract column-wise mean from a Linear layer's weight (and mean from bias).""" - linear_dtype = linear.weight.dtype - W_ = linear.weight.data.double() - linear.weight.data = (W_ - W_.mean(dim=-2, keepdim=True)).to(linear_dtype) - if linear.bias is not None: - b_ = linear.bias.data.double() - linear.bias.data = (b_ - b_.mean()).to(linear_dtype) - - -def _subtract_embedding_mean(model, mapping: RotationMapping) -> None: - """Subtract per-row mean from the embedding weight matrix.""" - W = _resolve(model, mapping.embedding) - dtype = W.weight.data.dtype - W_ = W.weight.data.to(dtype=torch.float64) - W.weight.data = (W_ - W_.mean(dim=-1, keepdim=True)).to(dtype=dtype) - - if mapping.positional_embedding is not None: - P = _resolve(model, mapping.positional_embedding) - p_dtype = P.weight.data.dtype - P_ = P.weight.data.to(dtype=torch.float64) - P.weight.data = (P_ - P_.mean(dim=-1, keepdim=True)).to(dtype=p_dtype) - - -class _RMSNorm(torch.nn.Module): - """RMS Normalization (no mean subtraction).""" - - def __init__(self, dim: int, eps: float = 1e-5): - super().__init__() - self.eps = eps - self.register_buffer("weight", torch.ones(dim)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps) - return x / rms * self.weight - - -def _replace_layernorms_with_rmsnorm(model) -> None: - """Replace all ``nn.LayerNorm`` modules with ``_RMSNorm``.""" - replacements = [] - for name, module in model.named_modules(): - if isinstance(module, torch.nn.LayerNorm): - replacements.append((name, module)) - - for name, module in replacements: - parts = name.rsplit(".", 1) - if len(parts) == 2: - parent = _resolve(model, parts[0]) - attr = parts[1] - else: - parent = model - attr = parts[0] - rms = _RMSNorm(module.normalized_shape[0], eps=module.eps) - rms = rms.to(device=module.weight.device, dtype=module.weight.dtype) - setattr(parent, attr, rms) - - -# --------------------------------------------------------------------------- -# High-level steps driven by RotationMapping -# --------------------------------------------------------------------------- - - -def _fuse_layer_norms(model, mapping: RotationMapping) -> None: - """Fuse all LayerNorm parameters into adjacent Linear layers.""" - layers = _resolve(model, mapping.layers_attr) - - for layer in layers: - mlp_ln = _resolve(layer, mapping.mlp_input_ln) - mlp_linears = [_resolve(layer, 
p) for p in mapping.mlp_in] - _fuse_ln_linear(mlp_ln, mlp_linears) - _reset_ln_params(mlp_ln) - - attn_ln = _resolve(layer, mapping.attn_input_ln) - attn_linears = [ - _resolve(layer, mapping.attn_q), - _resolve(layer, mapping.attn_k), - _resolve(layer, mapping.attn_v), - ] - _fuse_ln_linear(attn_ln, attn_linears) - _reset_ln_params(attn_ln) - - pre_head_ln = _resolve(model, mapping.pre_head_ln) - lm_head = _resolve(model, mapping.lm_head) - _fuse_ln_linear(pre_head_ln, [lm_head]) - _reset_ln_params(pre_head_ln) - - -# --------------------------------------------------------------------------- -# Unified weight rotation (full or grouped) -# --------------------------------------------------------------------------- - - -@torch.inference_mode() -def _rotate_weights( - model, - mapping: RotationMapping, - use_fast_had: bool = True, - group_size: int = None, - compute_device: torch.device = None, - had_dict: dict = None, - preset: str = None, - fuse_online_to_weight: bool = True, -) -> None: - """Apply Hadamard rotation to all weights. - - Args: - group_size: ``None`` → full Hadamard rotation. - ``int`` → block-diagonal rotation with this block size. - compute_device: Device to run Hadamard computation on (e.g. ``"cuda:0"``). - Weights are moved there temporarily and moved back afterwards. - If ``None``, auto-detects GPU availability. - allow_online_rotation: If ``True`` (default), apply extra input-side - Hadamard rotations on ``down_proj`` and the OV pair (``v_proj`` - output + ``o_proj`` input) that require compensating online hooks - at inference time. If ``False``, skip those extra rotations so - that **no** online hooks are needed. - had_dict: Normalized ``dict[int, Tensor]`` of custom Hadamard matrices - (keyed by dimension). Only used in grouped mode. - preset: Rotation preset name (``"quarot_hadamard"``, ``"hadamard"``, - ``"random_hadamard"``, or ``None``). - - * ``"quarot_hadamard"``: fusable (residual-stream) rotations use - ``fast_hadamard_transform`` / random Hadamard; non-fusable - (online-paired) rotations and their weight-side counterparts use - deterministic ``get_hadK``/``matmul_hadU`` so that the online - hook at inference produces the exact same transform. - * ``"hadamard"``: all rotations use deterministic ``get_hadK`` / - ``matmul_hadU``. Full-mode Q is a deterministic Hadamard matrix. - * ``"random_hadamard"``: all rotations use random Hadamard matrices - from the global cache (``get_or_create_random_hadamard``). - Same dimension → same matrix everywhere. - * ``None``: same behaviour as ``"hadamard"`` (built-in butterfly). 
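All of these presets rely on the same pairing rule: the online transform must reproduce exactly the matrix that was fused into the paired weight, otherwise the two sides no longer cancel. A standalone numerical check of that rule (pure torch, independent of the code above):

    # Matched vs. mismatched rotation pairs for a toy Linear weight.
    import torch

    torch.manual_seed(0)
    h2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]]) / 2 ** 0.5
    H = torch.kron(h2, h2)                             # 4x4 normalized Hadamard, H @ H.T == I
    signs = torch.tensor([1.0, -1.0, 1.0, -1.0])
    H_other = H @ torch.diag(signs)                    # a *different* orthogonal matrix

    W = torch.randn(3, 4)                              # toy weight (out_features=3, in_features=4)
    x = torch.randn(5, 4)                              # toy activation batch
    y_ref = x @ W.T

    # Same matrix on both sides: (x H)(W H)^T == x W^T, the rotation cancels.
    assert torch.allclose((x @ H) @ (W @ H).T, y_ref, atol=1e-5)
    # Different (but still orthogonal) matrix online: no cancellation.
    assert not torch.allclose((x @ H_other) @ (W @ H).T, y_ref)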
- """ - compute_device = _resolve_compute_device(compute_device) - config = model.config - hidden_size = getattr(config, mapping.hidden_size_attr) - intermediate_size = getattr(config, mapping.intermediate_size_attr) - num_heads = getattr(config, mapping.num_heads_attr) - head_dim = mapping.attn_head_dim or (hidden_size // num_heads) - - is_grouped = group_size is not None and group_size > 0 - desc = f"Rotating (group_size={group_size})" if is_grouped else "Rotating" - - # ----- Resolve per-operation Hadamard sources ----- - fused_fast = use_fast_had - online_fast = False - if preset == "random_hadamard": - fused_fast = False - - # -- Matrix resolution -- - had_matrix, _found = _get_custom_had(had_dict, group_size) if is_grouped else (None, False) - - online_had_matrix = had_matrix - if preset == "random_hadamard" and had_matrix is None: - had_matrix = get_or_create_random_hadamard(group_size if is_grouped else hidden_size, compute_device) - online_had_matrix = had_matrix - if preset == "quarot_hadamard" and is_grouped: - online_had_matrix = None # force deterministic for online-paired - - # -- Helper: look up cached random matrix for online-paired ops -- - def _online_had(dim): - """Return cached random matrix for *dim* under random_hadamard, else None.""" - if preset == "random_hadamard": - return get_or_create_random_hadamard(dim, compute_device) - return None - - if is_grouped: - assert hidden_size % group_size == 0, f"group_size={group_size} must divide hidden_size={hidden_size}" - assert ( - intermediate_size % group_size == 0 - ), f"group_size={group_size} must divide intermediate_size={intermediate_size}" - - # --- Full mode: build Hadamard matrix Q --- - Q = None - if not is_grouped: - if preset == "hadamard": - Q = deterministic_hadamard_matrix(hidden_size, compute_device) - else: - # "random_hadamard", "quarot_hadamard", None — same shape → same matrix - Q = get_or_create_random_hadamard(hidden_size, compute_device) - - # ---- Top-level: embedding / lm_head ---- - # When fuse_online_to_weight=False, skip embedding and lm_head rotation: - # each layer is self-contained (weight rotation + online hook cancel out). 
- if fuse_online_to_weight: - embedding = _resolve(model, mapping.embedding) - if is_grouped: - _rotate_embedding_grouped( - embedding, group_size, use_fast_had=fused_fast, compute_device=compute_device, had_matrix=had_matrix - ) - else: - dtype = embedding.weight.data.dtype - dev = embedding.weight.data.device - cdev = compute_device - W_ = embedding.weight.data.to(device=cdev, dtype=torch.float64) - new_W = torch.matmul(W_, Q.to(cdev)).to(device=dev, dtype=dtype) - del W_ - embedding.weight.data = new_W - - if mapping.positional_embedding is not None: - pos_emb = _resolve(model, mapping.positional_embedding) - if is_grouped: - _rotate_embedding_grouped( - pos_emb, group_size, use_fast_had=fused_fast, compute_device=compute_device, had_matrix=had_matrix - ) - else: - pos_dtype = pos_emb.weight.data.dtype - pos_dev = pos_emb.weight.data.device - cdev = compute_device - P_ = pos_emb.weight.data.to(device=cdev, dtype=torch.float64) - new_P = torch.matmul(P_, Q.to(cdev)).to(device=pos_dev, dtype=pos_dtype) - del P_ - pos_emb.weight.data = new_P - - # ---- Top-level: lm_head ---- - lm_head = _resolve(model, mapping.lm_head) - if is_grouped: - _rotate_linear_grouped( - lm_head, - group_size, - side="input", - use_fast_had=fused_fast, - compute_device=compute_device, - had_matrix=had_matrix, - ) - else: - _rotate_linear_by_Q(lm_head, Q, side="input", compute_device=compute_device) - - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - # ---- Per-layer rotation ---- - layers = _resolve(model, mapping.layers_attr) - for layer in tqdm.tqdm(layers, unit="layer", desc=desc): - if fuse_online_to_weight: - # ---- fuse mode: QuaRot-style residual stream rotation ---- - # Q/K/V: only residual Q on input (no online Had stacking, no hook). - # When Q == online Had (e.g. preset="hadamard"), Q @ Q = I cancels - # the rotation entirely, destroying quantization benefit. - # gate/up: only residual Q on input (no online Had stacking, no hook). - # down_proj: residual Q^T on output + online Had on input (+ hook). - # v_proj/o_proj: per-head/cross-head Had below (+ hook on o_proj). 
- for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v): - mod = _resolve(layer, attr) - if is_grouped: - _rotate_linear_grouped( - mod, - group_size, - side="input", - use_fast_had=fused_fast, - compute_device=compute_device, - had_matrix=had_matrix, - ) - else: - _rotate_linear_by_Q(mod, Q, side="input", compute_device=compute_device) - - # o_proj: residual stream output rotation - if is_grouped: - _rotate_linear_grouped( - _resolve(layer, mapping.attn_o), - group_size, - side="output", - use_fast_had=fused_fast, - compute_device=compute_device, - had_matrix=had_matrix, - ) - else: - _rotate_linear_by_Q(_resolve(layer, mapping.attn_o), Q, side="output", compute_device=compute_device) - - # gate/up: only residual Q on input - for attr in mapping.mlp_in: - mod = _resolve(layer, attr) - if is_grouped: - _rotate_linear_grouped( - mod, - group_size, - side="input", - use_fast_had=fused_fast, - compute_device=compute_device, - had_matrix=had_matrix, - ) - else: - _rotate_linear_by_Q(mod, Q, side="input", compute_device=compute_device) - - # down_proj: residual output + online input Had - down_proj = _resolve(layer, mapping.mlp_out) - if is_grouped: - _rotate_linear_grouped( - down_proj, - group_size, - side="output", - use_fast_had=fused_fast, - compute_device=compute_device, - had_matrix=had_matrix, - ) - _rotate_linear_grouped( - down_proj, - group_size, - side="input", - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_had_matrix, - ) - else: - _rotate_linear_by_Q(down_proj, Q, side="output", compute_device=compute_device) - apply_exact_had_to_linear( - down_proj, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(intermediate_size), - ) - - # OV projection: v_proj per-head output + o_proj full/cross-head input - v_proj = _resolve(layer, mapping.attn_v) - o_proj = _resolve(layer, mapping.attn_o) - if is_grouped: - pass - else: - online_head_had = _online_had(head_dim) - apply_exact_had_to_linear( - v_proj, - had_dim=head_dim, - output=True, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_head_had, - ) - if preset == "random_hadamard": - apply_exact_had_to_linear( - o_proj, - had_dim=head_dim, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_head_had, - ) - apply_cross_head_had_to_linear( - o_proj, - num_heads, - head_dim, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(num_heads), - ) - else: - apply_exact_had_to_linear( - o_proj, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - ) - - else: - # ---- unfused mode: no residual rotation, only input-side Had ---- - # Each layer gets Had fused on input side + compensating hook → equivalent. - # No embedding/lm_head rotation. No self-cancelling pair. - # v_proj treated same as Q/K (input Had only, no per-head/cross-head). 
- - # Q/K/V: input-side Had on hidden_size - for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v): - mod = _resolve(layer, attr) - if is_grouped: - _rotate_linear_grouped( - mod, - group_size, - side="input", - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_had_matrix, - ) - else: - apply_exact_had_to_linear( - mod, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(hidden_size), - ) - - # o_proj: input-side Had on hidden_size (full Had, not cross-head) - o_proj = _resolve(layer, mapping.attn_o) - if is_grouped: - _rotate_linear_grouped( - o_proj, - group_size, - side="input", - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_had_matrix, - ) - else: - apply_exact_had_to_linear( - o_proj, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(hidden_size), - ) - - # gate/up: input-side Had on hidden_size - for attr in mapping.mlp_in: - mod = _resolve(layer, attr) - if is_grouped: - _rotate_linear_grouped( - mod, - group_size, - side="input", - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_had_matrix, - ) - else: - apply_exact_had_to_linear( - mod, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(hidden_size), - ) - - # down_proj: input-side Had on intermediate_size - down_proj = _resolve(layer, mapping.mlp_out) - if is_grouped: - _rotate_linear_grouped( - down_proj, - group_size, - side="input", - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=online_had_matrix, - ) - else: - apply_exact_had_to_linear( - down_proj, - had_dim=-1, - output=False, - use_fast_had=online_fast, - compute_device=compute_device, - had_matrix=_online_had(intermediate_size), - ) - - # Per-layer cleanup: drop fp64 temporaries and CUDA caching allocator - # blocks so peak memory stays at ~1 layer's worth instead of accumulating - # across all 32+ decoder layers (was the main cause of 33 GB RAM on 8B). - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -# --------------------------------------------------------------------------- -# Unified online hook registration -# --------------------------------------------------------------------------- - - -def _register_online_hooks( - model, - mapping: RotationMapping, - fp32_had: bool = False, - use_fast_had: bool = True, - group_size: int = None, - had_dict: dict = None, - preset: str = None, - fuse_online_to_weight: bool = True, -): - """Register online Hadamard pre-forward hooks on ``down_proj`` and ``o_proj``. - - Online hooks must use the **same** Hadamard matrix that was applied to the - weight-side counterpart during ``_rotate_weights``. For ``quarot_hadamard`` - this is always the deterministic ``get_hadK``/``matmul_hadU`` path - (``use_fast_had=False``). For ``"random_hadamard"`` it is the random matrix that - was generated once and stored in ``had_dict``. - - Args: - group_size: ``None`` → full Hadamard hooks (original QuaRot). - ``int`` → per-group Hadamard hooks. - had_dict: Normalized ``dict[int, Tensor]`` of custom Hadamard matrices. - preset: Rotation preset name. - Returns: - list of hook handles. 
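The hooks registered here are ordinary pre-forward hooks. A self-contained sketch of the mechanism (a toy stand-in, not the library classes): fuse a Hadamard into a Linear's input side, then register a hook that applies the same transform to the activation so the module's output is unchanged:

    import torch

    h2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]]) / 2 ** 0.5
    H = torch.kron(h2, h2)                      # 4x4 normalized Hadamard

    layer = torch.nn.Linear(4, 3, bias=False)
    x = torch.randn(2, 4)
    y_ref = layer(x)

    with torch.no_grad():
        layer.weight.copy_(layer.weight @ H)    # weight-side rotation of the input columns

    def online_hadamard(module, args):
        # Pre-forward hook: rotate the activation with the same H; since H @ H.T == I,
        # the weight-side and activation-side rotations cancel.
        return (args[0] @ H,) + args[1:]

    handle = layer.register_forward_pre_hook(online_hadamard)
    assert torch.allclose(layer(x), y_ref, atol=1e-5)
    handle.remove()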
- """ - config = model.config - num_heads = getattr(config, mapping.num_heads_attr) - hidden_size = getattr(config, mapping.hidden_size_attr) - intermediate_size = getattr(config, mapping.intermediate_size_attr) - head_dim = mapping.attn_head_dim or (hidden_size // num_heads) - - is_grouped = group_size is not None and group_size > 0 - - # Online hooks always use deterministic (fixed) Hadamard — never fast_had - # for quarot_hadamard; for "random_hadamard" they use the same random matrix - # that was cached in had_dict by _rotate_weights. - online_fast = False - - # -- Matrix resolution (must match the *online-paired* matrix used by - # _rotate_weights for down_proj input / OV pair). Variable name kept in - # sync with _rotate_weights to make any future drift obvious. - online_had_matrix, _ = _get_custom_had(had_dict, group_size) if is_grouped else (None, False) - if preset == "random_hadamard" and online_had_matrix is None: - online_had_matrix = get_or_create_random_hadamard(group_size if is_grouped else hidden_size) - if preset == "quarot_hadamard" and is_grouped: - online_had_matrix = None - - # -- Helper: look up cached random matrix for online-paired hooks -- - def _online_had(dim): - if preset == "random_hadamard": - return get_or_create_random_hadamard(dim) - return None - - mlp_out_suffix = mapping.mlp_out.split(".")[-1] - attn_o_suffix = mapping.attn_o.split(".")[-1] - - # Suffixes for Q/K/V and gate/up (for online input Had hooks) - attn_qkv_suffixes = set(attr.split(".")[-1] for attr in (mapping.attn_q, mapping.attn_k, mapping.attn_v)) - mlp_in_suffixes = set(attr.split(".")[-1] for attr in mapping.mlp_in) - - # --- Build hook factories --- - def _make_down_proj_hook(): - if is_grouped: - return GroupOnlineHadamardHook( - group_size=group_size, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_had_matrix - ) - online_mat = _online_had(intermediate_size) - if online_mat is not None: - return FullOnlineHadamardHook( - had_K=None, K=None, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_mat - ) - had_K, K = get_hadK(intermediate_size) - return FullOnlineHadamardHook(had_K=had_K, K=K, fp32_had=fp32_had, use_fast_had=online_fast) - - def _make_hidden_had_hook(): - """Full Had hook on hidden_size (for Q/K/V and gate/up input).""" - if is_grouped: - return GroupOnlineHadamardHook( - group_size=group_size, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_had_matrix - ) - online_mat = _online_had(hidden_size) - if online_mat is not None: - return FullOnlineHadamardHook( - had_K=None, K=None, fp32_had=fp32_had, use_fast_had=online_fast, had_matrix=online_mat - ) - had_K, K = get_hadK(hidden_size) - return FullOnlineHadamardHook(had_K=had_K, K=K, fp32_had=fp32_had, use_fast_had=online_fast) - - def _make_o_proj_hook(): - online_mat = _online_had(num_heads) - if online_mat is not None: - return CrossHeadOnlineHadamardHook( - had_K=None, - K=None, - head_dim=head_dim, - fp32_had=fp32_had, - use_fast_had=online_fast, - had_matrix=online_mat, - ) - had_K, K = get_hadK(num_heads) - return CrossHeadOnlineHadamardHook( - had_K=had_K, - K=K, - head_dim=head_dim, - fp32_had=fp32_had, - use_fast_had=online_fast, - ) - - # --- Register --- - handles = [] - - for name, module in model.named_modules(): - if not isinstance(module, torch.nn.Linear): - continue - suffix = name.split(".")[-1] - - if name.endswith(mlp_out_suffix): - # down_proj: full Had on intermediate_size input - h = module.register_forward_pre_hook(_make_down_proj_hook()) - handles.append(h) - 
elif name.endswith(attn_o_suffix): - if fuse_online_to_weight and not is_grouped: - # o_proj: cross-head Had on input (fused mode, full only) - h = module.register_forward_pre_hook(_make_o_proj_hook()) - handles.append(h) - elif not fuse_online_to_weight: - # o_proj: full Had on hidden_size input (unfused mode, matches weight rotation) - h = module.register_forward_pre_hook(_make_hidden_had_hook()) - handles.append(h) - elif suffix in attn_qkv_suffixes: - if not fuse_online_to_weight: - # Q/K/V: full Had on hidden_size input (unfused mode only). - # In fused mode Q/K/V only have residual Q on weight (no online Had), - # and activations come pre-rotated from residual stream → no hook needed. - h = module.register_forward_pre_hook(_make_hidden_had_hook()) - handles.append(h) - elif suffix in mlp_in_suffixes: - if not fuse_online_to_weight: - # gate/up: full Had on hidden_size input (unfused mode only). - # Same reasoning as Q/K/V above. - h = module.register_forward_pre_hook(_make_hidden_had_hook()) - handles.append(h) - - return handles - - -# --------------------------------------------------------------------------- -# Public API -# --------------------------------------------------------------------------- - - -def apply_rotation_transform( - model, - group_size: int = None, - allow_online_rotation: bool = True, - rotation_matrix: Union[str, torch.Tensor, Dict[int, torch.Tensor], None] = None, - compute_device: torch.device | str = None, - fp32_had: bool = False, - fuse_online_to_weight: bool = None, -): - """Fuse layer norms, rotate weights, and register online Hadamard hooks. - - This is the single entry point for applying Hadamard inplace rotation. - The model architecture is auto-detected via ``model.config.model_type``. - - Args: - model: A HuggingFace CausalLM model (LLaMA-2/3, Qwen-3, etc.). - fp32_had: Whether to compute the online Hadamard transform in fp32. - group_size: If ``None`` (default), use full-dimension Hadamard rotation. - compute_device: Device to run Hadamard computation on. - allow_online_rotation: If ``True`` (default), apply online Hadamard - rotations on ``down_proj`` input and the OV pair. - rotation_matrix: Rotation matrix selection (``"hadamard"``, - ``"random_hadamard"``, ``"quarot_hadamard"``, Tensor, dict, or None). - fuse_online_to_weight: If ``True`` (default), fuse online Hadamard - rotation into weights (down_proj input, v_proj output, o_proj input) - and register compensating online hooks. If ``False``, skip - embedding/lm_head rotation; each linear layer is self-contained - with input-side Had on weight + compensating online hook on - activation. No v_proj cross-head or inner-head rotation. - - Returns: - list of hook handles.""" - if fuse_online_to_weight is None: - if model.config.model_type in MAPPING_REGISTRY or model.__class__.__name__ in MAPPING_REGISTRY: - fuse_online_to_weight = True - else: - fuse_online_to_weight = False - had_dict, use_fast_had, preset = _normalize_rotation_matrix(rotation_matrix, group_size) - compute_device = _resolve_compute_device(compute_device) - - if use_fast_had: - from auto_round.utils import logger - - try: - import fast_hadamard_transform # noqa: F401 - - if group_size is None: - logger.warning( - "fast_hadamard_transform uses a different Hadamard matrix than the " - "default implementation. Please ensure consistency between training " - "and inference. This will be refined later." 
- ) - except ImportError: - logger.warning("Importing fast_hadamard_transform failed, falling back to default implementation.") - use_fast_had = False - - mapping = infer_mapping_from_model(model) - - _untie_word_embeddings(model, mapping) - - if _uses_layernorm_with_mean(model, mapping): - _subtract_embedding_mean(model, mapping) - - _fuse_layer_norms(model, mapping) - - if _uses_layernorm_with_mean(model, mapping): - layers = _resolve(model, mapping.layers_attr) - for layer in layers: - _bake_mean_into_linear(_resolve(layer, mapping.attn_o)) - _bake_mean_into_linear(_resolve(layer, mapping.mlp_out)) - _replace_layernorms_with_rmsnorm(model) - - _rotate_weights( - model, - mapping, - use_fast_had=use_fast_had, - group_size=group_size, - compute_device=compute_device, - had_dict=had_dict, - preset=preset, - fuse_online_to_weight=fuse_online_to_weight, - ) - - handles = [] - if fuse_online_to_weight or allow_online_rotation: - handles = _register_online_hooks( - model, - mapping, - fp32_had=fp32_had, - use_fast_had=use_fast_had, - group_size=group_size, - had_dict=had_dict, - preset=preset, - fuse_online_to_weight=fuse_online_to_weight, - ) - - return model, handles - - -# --------------------------------------------------------------------------- -# Quick smoke test -# --------------------------------------------------------------------------- - -if __name__ == "__main__": - from transformers import AutoModelForCausalLM, AutoTokenizer - - model_name = "/models/opt-125m" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - model.to("cuda") - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - - apply_rotation_transform( - model, group_size=-1, allow_online_rotation=True, rotation_matrix="random_hadamard", fuse_online_to_weight=False - ) - model.to("cuda") - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - - model_name = "/models/Qwen3-8B" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - apply_rotation_transform(model, group_size=-1, allow_online_rotation=True, fuse_online_to_weight=True) - model.to("cuda") - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - - from transformers import AutoModelForCausalLM, AutoTokenizer - - model_name = "/models/Meta-Llama-3.1-8B-Instruct" - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - apply_rotation_transform(model, fuse_online_to_weight=True, group_size=32) - model.to("cuda") - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - # - # model_name = "/models/Llama-2-7b-chat-hf" - # tokenizer = AutoTokenizer.from_pretrained(model_name) - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - # apply_hadamard_rotation(model) - # 
model.to("cuda") - # text = "There is a girl who likes adventure," - # inputs = tokenizer(text, return_tensors="pt").to(model.device) - # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) diff --git a/auto_round/experimental/rotation_inplace/model_config.py b/auto_round/experimental/rotation_inplace/model_config.py index 3ecbf9b69..35078cd28 100644 --- a/auto_round/experimental/rotation_inplace/model_config.py +++ b/auto_round/experimental/rotation_inplace/model_config.py @@ -1,169 +1,9 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -"""Model architecture mapping for Hadamard rotation. - -Each :class:`RotationMapping` describes *where* the rotation-relevant modules -live inside a model. Currently supports LLaMA-2, LLaMA-3, and Qwen-3 (dense). - -New architectures can be supported by calling :func:`register_mapping`. +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.inplace.model_config`. """ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Dict, List, Optional - -from auto_round.utils import logger - -__all__ = [ - "RotationMapping", - "register_mapping", - "get_mapping", - "infer_mapping_from_model", - "MAPPING_REGISTRY", -] - - -# --------------------------------------------------------------------------- -# Mapping dataclass -# --------------------------------------------------------------------------- - - -@dataclass -class RotationMapping: - """Declarative description of a transformer architecture for Hadamard rotation. - - Attribute names follow the dot-path convention relative to the model or - each decoder layer. - - Config attribute names (read from ``model.config``): - num_heads_attr, hidden_size_attr, intermediate_size_attr - head_dim_override – explicit head dim (skip hidden_size // num_heads) - """ - - # -- top-level modules (dot-path from model root) -- - embedding: str = "model.embed_tokens" - lm_head: str = "lm_head" - positional_embedding: Optional[str] = None # e.g. 
"model.decoder.embed_positions" for OPT - - # -- layers container (dot-path from model root) -- - layers_attr: str = "model.layers" - - # -- per-layer: attention (dot-path from each layer) -- - attn_input_ln: str = "input_layernorm" - attn_q: str = "self_attn.q_proj" - attn_k: str = "self_attn.k_proj" - attn_v: str = "self_attn.v_proj" - attn_o: str = "self_attn.o_proj" - - # -- per-layer: MLP (dot-path from each layer) -- - mlp_input_ln: str = "post_attention_layernorm" - mlp_in: List[str] = field(default_factory=lambda: ["mlp.up_proj", "mlp.gate_proj"]) - mlp_out: str = "mlp.down_proj" - - # -- final norm (dot-path from model root) -- - pre_head_ln: str = "model.norm" - - # -- head dim override (None = hidden_size // num_heads) -- - attn_head_dim: Optional[int] = None - - # -- config attr names -- - num_heads_attr: str = "num_attention_heads" - hidden_size_attr: str = "hidden_size" - intermediate_size_attr: str = "intermediate_size" - - -# --------------------------------------------------------------------------- -# Helper: resolve a dot-path attribute on a module -# --------------------------------------------------------------------------- - - -def _resolve(root, dot_path: str): - """Resolve ``'a.b.c'`` to ``root.a.b.c``.""" - obj = root - for attr in dot_path.split("."): - obj = getattr(obj, attr) - return obj - - -# --------------------------------------------------------------------------- -# Registry -# --------------------------------------------------------------------------- - -MAPPING_REGISTRY: Dict[str, RotationMapping] = {} - - -def register_mapping(key: str, mapping: RotationMapping) -> RotationMapping: - """Register a :class:`RotationMapping` under *key* (model_type or architecture).""" - MAPPING_REGISTRY[key] = mapping - return mapping - - -def get_mapping(key: str) -> RotationMapping: - """Look up a mapping by *key*; fall back to default if not found.""" - if key in MAPPING_REGISTRY: - return MAPPING_REGISTRY[key] - logger.warning(f"No rotation mapping registered for '{key}', " "falling back to default (LLaMA-like) mapping.") - return RotationMapping() - - -def infer_mapping_from_model(model) -> RotationMapping: - """Return the best :class:`RotationMapping` for *model*. - - Tries ``model.config.model_type`` first, then ``model.__class__.__name__``. - """ - model_type = getattr(getattr(model, "config", None), "model_type", "") - if model_type in MAPPING_REGISTRY: - return MAPPING_REGISTRY[model_type] - - arch = model.__class__.__name__ - if arch in MAPPING_REGISTRY: - return MAPPING_REGISTRY[arch] - - logger.warning( - f"Unrecognised architecture '{arch}' (model_type='{model_type}'). " - "Falling back to default (LLaMA-like) mapping." - ) - return RotationMapping() - - -# =================================================================== -# Built-in mappings -# =================================================================== - -# LLaMA-2 / LLaMA-3 / Mistral / Yi — all share the same layout -_default = RotationMapping() - -register_mapping("llama", _default) -register_mapping("LlamaForCausalLM", _default) - -# Qwen-3 dense — identical layout to LLaMA -register_mapping("qwen3", _default) -register_mapping("Qwen3ForCausalLM", _default) - -# Qwen-2 / Qwen-2.5 dense — identical layout to LLaMA -register_mapping("qwen2", _default) -register_mapping("Qwen2ForCausalLM", _default) - -# ---- OPT ---- -# OPT uses standard LayerNorm (with bias, subtracts mean), -# different module names, and tied lm_head ↔ embedding weights. 
-_opt = RotationMapping( - embedding="model.decoder.embed_tokens", - lm_head="lm_head", - positional_embedding="model.decoder.embed_positions", - layers_attr="model.decoder.layers", - attn_input_ln="self_attn_layer_norm", - attn_q="self_attn.q_proj", - attn_k="self_attn.k_proj", - attn_v="self_attn.v_proj", - attn_o="self_attn.out_proj", - mlp_input_ln="final_layer_norm", - mlp_in=["fc1"], - mlp_out="fc2", - pre_head_ln="model.decoder.final_layer_norm", - intermediate_size_attr="ffn_dim", -) -register_mapping("opt", _opt) -register_mapping("OPTForCausalLM", _opt) +from auto_round.algorithms.transforms.rotation.inplace.model_config import * # noqa: F401, F403 diff --git a/auto_round/experimental/rotation_inplace/utils.py b/auto_round/experimental/rotation_inplace/utils.py index 04bb18981..4ddd80d48 100644 --- a/auto_round/experimental/rotation_inplace/utils.py +++ b/auto_round/experimental/rotation_inplace/utils.py @@ -1,786 +1,9 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -"""Online Hadamard transform hooks. - -After weight rotation, down_proj and o_proj require an online Hadamard -transform on their *input activations* at inference time. This module -provides the hooks and a helper to register them on the model. +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.inplace.hooks`. """ -import math - -import torch -import torch.nn as nn - -try: - import fast_hadamard_transform -except ImportError: - fast_hadamard_transform = None - - -def _resolve_compute_device(compute_device) -> torch.device: - """Return *compute_device* if explicitly given, otherwise auto-detect GPU. - - When ``compute_device`` is ``None`` the function checks for CUDA / XPU - availability and returns the first accelerator it finds so that heavy - matrix operations are offloaded to GPU even when the model weights live - on CPU. Falls back to ``torch.device("cpu")`` when no accelerator is - present. - """ - if compute_device is not None: - return torch.device(compute_device) if not isinstance(compute_device, torch.device) else compute_device - if torch.cuda.is_available(): - return torch.device("cuda:0") - if hasattr(torch, "xpu") and torch.xpu.is_available(): - return torch.device("xpu:0") - return torch.device("cpu") - - -BUILTIN_ROTATION_PRESETS = {"quarot_hadamard", "hadamard", "random_hadamard"} - -# Global cache for random Hadamard matrices keyed by dimension. -# Ensures the same shape always returns the exact same random matrix within -# a process, across all calls to ``_rotate_weights`` / ``_register_online_hooks``. -_RANDOM_HADAMARD_CACHE: dict = {} - - -def get_or_create_random_hadamard(dim: int, device=None) -> torch.Tensor: - """Return a random Hadamard matrix for *dim*, creating and caching it if needed. - - The matrix is cached globally in ``_RANDOM_HADAMARD_CACHE`` so that every - caller that requests the same *dim* receives the identical matrix. - """ - if dim in _RANDOM_HADAMARD_CACHE: - mat = _RANDOM_HADAMARD_CACHE[dim] - if device is not None: - mat = mat.to(device) - return mat - mat = random_hadamard_matrix(dim, device or torch.device("cpu")) - _RANDOM_HADAMARD_CACHE[dim] = mat - return mat - - -def clear_random_hadamard_cache(): - """Clear the global random Hadamard matrix cache. - - Call this when you want subsequent ``random_hadamard`` preset runs to - generate fresh random matrices (e.g. between independent experiments). 
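A usage sketch of the cache contract described above; it assumes ``get_or_create_random_hadamard`` remains importable through the compatibility shims (as the star-import suggests):

    import torch
    from auto_round.experimental.rotation_inplace import clear_random_hadamard_cache
    from auto_round.experimental.rotation_inplace.utils import get_or_create_random_hadamard

    a = get_or_create_random_hadamard(128)
    b = get_or_create_random_hadamard(128)
    assert torch.equal(a, b)          # same dimension -> the identical cached matrix

    clear_random_hadamard_cache()     # e.g. between independent experiments
    c = get_or_create_random_hadamard(128)
    assert not torch.equal(a, c)      # fresh random signs (with overwhelming probability)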
- """ - _RANDOM_HADAMARD_CACHE.clear() - - -def _normalize_rotation_matrix(rotation_matrix, group_size): - """Normalize ``rotation_matrix`` into a ``(had_dict, use_fast_had, preset)`` tuple. - - Accepted inputs: - * ``None`` → ``(None, False, None)`` — use built-in butterfly ``matmul_hadU``. - * ``"quarot_hadamard"`` → ``(None, True, "quarot_hadamard")`` — fusable - rotations use ``fast_hadamard_transform`` (random); non-fusable - (online-paired) rotations use deterministic ``get_hadK``/``matmul_hadU``. - * ``"hadamard"`` → ``(None, False, "hadamard")`` — all rotations use - deterministic ``get_hadK``/``matmul_hadU``. - * ``"random_hadamard"`` → ``(None, False, "random_hadamard")`` — all rotations use - ``random_hadamard_matrix``. - * A ``torch.Tensor`` of shape ``(n, n)`` → ``({n: tensor}, False, None)``. - * A ``dict[int, Tensor]`` → ``(dict, False, None)`` — returned as-is. - - Returns: - ``(had_dict, use_fast_had, preset)`` - - Raises: - ValueError: if a non-``str`` *rotation_matrix* is given but - *group_size* is not a positive integer, or an unknown preset. - """ - if rotation_matrix is None: - return None, False, None - - if isinstance(rotation_matrix, str): - if rotation_matrix not in BUILTIN_ROTATION_PRESETS: - raise ValueError( - f"Unknown rotation_matrix preset '{rotation_matrix}'. " - f"Supported presets: {BUILTIN_ROTATION_PRESETS}." - ) - if rotation_matrix == "quarot_hadamard": - return None, True, "quarot_hadamard" - elif rotation_matrix == "hadamard": - return None, False, "hadamard" - else: # "random_hadamard" - return None, False, "random_hadamard" - - is_grouped = group_size is not None and group_size > 0 - if not is_grouped and not isinstance(rotation_matrix, dict): - raise ValueError( - "rotation_matrix (Tensor/dict) can only be used with a positive group_size. " - f"Got group_size={group_size}." - ) - - if isinstance(rotation_matrix, torch.Tensor): - assert ( - rotation_matrix.ndim == 2 and rotation_matrix.shape[0] == rotation_matrix.shape[1] - ), f"rotation_matrix must be square, got shape {rotation_matrix.shape}" - return {rotation_matrix.shape[0]: rotation_matrix}, False, None - - if isinstance(rotation_matrix, dict): - for k, t in rotation_matrix.items(): - assert ( - isinstance(t, torch.Tensor) and t.ndim == 2 and t.shape[0] == t.shape[1] - ), f"rotation_matrix[{k}] must be a square tensor, got shape {t.shape}" - return rotation_matrix, False, None - - raise TypeError( - f"rotation_matrix must be a Tensor, dict[int, Tensor], str, or None. " f"Got {type(rotation_matrix)}." - ) - - -def _get_custom_had(had_dict, size): - """Look up a custom Hadamard matrix for *size* from the normalized dict. - - Returns ``(had_tensor, True)`` if found, ``(None, False)`` otherwise. 
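At the user-facing level, the Tensor / ``dict[int, Tensor]`` forms described above are what the public ``apply_rotation_transform`` accepts for ``rotation_matrix`` together with a positive ``group_size``. A usage sketch only: the checkpoint id and block size are illustrative, and the matrix is a plain normalized Hadamard built on the fly:

    import torch
    from transformers import AutoModelForCausalLM
    from auto_round.experimental.rotation_inplace import apply_rotation_transform

    group_size = 128
    h = torch.tensor([[1.0, 1.0], [1.0, -1.0]], dtype=torch.float64) / 2 ** 0.5
    H = h
    for _ in range(6):                      # 2 -> 128 via repeated Kronecker products
        H = torch.kron(H, h)

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.float16)

    # dict[int, Tensor] form: keyed by block size, only valid with a positive group_size.
    model, hooks = apply_rotation_transform(
        model,
        group_size=group_size,
        rotation_matrix={group_size: H},
        allow_online_rotation=True,
    )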
- """ - if had_dict is None: - return None, False - if size in had_dict: - return had_dict[size], True - return None, False - - -# --------------------------------------------------------------------------- -# Hook implementations -# --------------------------------------------------------------------------- - - -class FullOnlineHadamardHook(nn.Module): - """Pre-forward hook: full Hadamard on the entire last dimension (for ``down_proj``).""" - - def __init__(self, had_K, K, fp32_had=False, use_fast_had=True, had_matrix=None): - super().__init__() - self.custom_had = had_matrix is not None - if had_matrix is not None: - self.register_buffer("had_matrix", had_matrix) - self.had_K = None - self.K = None - else: - if had_K is not None: - self.register_buffer("had_K", had_K) - else: - self.had_K = None - self.K = K - self.fp32_had = fp32_had - self.use_fast_had = use_fast_had - - def __call__(self, module: nn.Module, args): - x = args[0] if isinstance(args, tuple) else args - x_dtype = x.dtype - - if self.custom_had: - H = self.had_matrix.to(device=x.device, dtype=x.dtype) - if self.fp32_had: - H = self.had_matrix.to(device=x.device).float() - x = (x.float() @ H.T).to(x_dtype) - else: - x = x @ H.T - elif self.fp32_had: - x = matmul_hadU_cuda(x.float(), self.had_K, self.K, use_fast_had=self.use_fast_had).to(x_dtype) - else: - x = matmul_hadU_cuda(x, self.had_K, self.K, use_fast_had=self.use_fast_had) - - if isinstance(args, tuple): - return (x,) + args[1:] - return x - - -class CrossHeadOnlineHadamardHook(nn.Module): - """Pre-forward hook: **cross-head** Hadamard on the ``num_heads`` dimension - (for ``o_proj``). - - After offline rotation: - - ``v_proj`` absorbed a per-head (within-head) Hadamard on ``head_dim``. - - ``o_proj`` absorbed a full Hadamard on ``hidden_size``. - - Since ``H_full = H_cross ⊗ H_within`` (Kronecker decomposition) and the - within-head part is already cancelled by ``v_proj`` through the attention - path (``H_within² = I``), the online hook only needs to apply the residual - **cross-head** Hadamard (``H_cross ⊗ I``): - - * reshape ``(*, hidden_size)`` → ``(*, num_heads, head_dim)`` - * transpose → ``(*, head_dim, num_heads)`` - * Hadamard on the **num_heads** axis (last dim) - * transpose back and reshape - """ - - def __init__(self, had_K, K, head_dim, fp32_had=False, use_fast_had=True, had_matrix=None): - """ - Args: - had_K: Hadamard sub-matrix from ``get_hadK(num_heads)``. - K: Block size from ``get_hadK(num_heads)``. - head_dim: ``hidden_size // num_attention_heads``. - fp32_had: Compute in fp32. - use_fast_had: If True use fast_hadamard_transform; if False use matmul_hadU. - had_matrix: Optional custom rotation matrix of shape ``(num_heads, num_heads)``. 
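The Kronecker argument above can be checked numerically on toy sizes: the reshape/transpose route taken by the hook matches a flat multiplication by ``H_cross ⊗ I_head_dim`` on the hidden dimension:

    import torch

    num_heads, head_dim = 4, 8
    hidden = num_heads * head_dim

    h2 = torch.tensor([[1.0, 1.0], [1.0, -1.0]]) / 2 ** 0.5
    H_cross = torch.kron(h2, h2)                      # 4x4 normalized Hadamard

    x = torch.randn(3, hidden)

    # Path 1: the hook's reshape/transpose route (Hadamard on the num_heads axis).
    y1 = x.reshape(-1, num_heads, head_dim).transpose(1, 2)   # (*, head_dim, num_heads)
    y1 = (y1 @ H_cross.T).transpose(1, 2).reshape(x.shape)

    # Path 2: the equivalent flat matrix (H_cross ⊗ I) acting on the hidden dimension.
    K = torch.kron(H_cross, torch.eye(head_dim))
    y2 = x @ K.T

    assert torch.allclose(y1, y2, atol=1e-5)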
- """ - super().__init__() - self.custom_had = had_matrix is not None - if had_matrix is not None: - self.register_buffer("had_matrix", had_matrix) - self.had_K = None - self.K = None - else: - if had_K is not None: - self.register_buffer("had_K", had_K) - else: - self.had_K = None - self.K = K - self.had_dim = head_dim - self.fp32_had = fp32_had - self.use_fast_had = use_fast_had - - def __call__(self, module: nn.Module, args): - x = args[0] if isinstance(args, tuple) else args - x_dtype = x.dtype - - if self.fp32_had: - x = x.float() - - init_shape = x.shape - num_heads = init_shape[-1] // self.had_dim - - if self.custom_had: - H = self.had_matrix.to(device=x.device, dtype=x.dtype) - # reshape (*, hidden) → (*, num_heads, head_dim), transpose → (*, head_dim, num_heads) - x = x.reshape(-1, num_heads, self.had_dim).transpose(1, 2) - # apply H on last dim (num_heads): x @ H.T - x = (x @ H.T).transpose(1, 2) - elif self.use_fast_had and fast_hadamard_transform is not None and self.K == 1: - x = fast_hadamard_transform.hadamard_transform( - x.reshape(-1, num_heads, self.had_dim).transpose(1, 2), - scale=1 / math.sqrt(num_heads), - ).transpose(1, 2) - else: - # Fallback: use matmul_hadU (pure butterfly + had_K, no fast_hadamard_transform) - x = x.reshape(-1, num_heads, self.had_dim).transpose(1, 2) - x = matmul_hadU(x.contiguous()) - x = x.transpose(1, 2) - - if self.fp32_had: - x = x.to(x_dtype) - x = x.reshape(init_shape) - - if isinstance(args, tuple): - return (x,) + args[1:] - return x - - -# --------------------------------------------------------------------------- -# Registration helper -# --------------------------------------------------------------------------- - - -def register_online_had_hooks(model, mapping=None, fp32_had=False, use_fast_had=True): - """Register online Hadamard pre-forward hooks on ``down_proj`` and ``o_proj``. - - * **down_proj** (``online_full_had``): full Hadamard on ``intermediate_size``. - Compensates ``apply_exact_had_to_linear(down_proj, had_dim=-1, output=False)``. - - * **o_proj** (``online cross-head had``): cross-head Hadamard on ``num_heads``. - Compensates the residual after v_proj's within-head Hadamard cancels. - - Args: - model: A HuggingFace model whose weights have already been rotated. - mapping: A :class:`RotationMapping` (auto-inferred if ``None``). - fp32_had: Whether to compute the Hadamard transform in fp32. - use_fast_had: If True use fast_hadamard_transform; if False use matmul_hadU. - - Returns: - list of hook handles (call ``handle.remove()`` to detach). - """ - if mapping is None: - from auto_round.experimental.rotation_inplace.model_config import infer_mapping_from_model - - mapping = infer_mapping_from_model(model) - - config = model.config - num_heads = getattr(config, mapping.num_heads_attr) - hidden_size = getattr(config, mapping.hidden_size_attr) - intermediate_size = getattr(config, mapping.intermediate_size_attr) - head_dim = mapping.attn_head_dim or (hidden_size // num_heads) - - # down_proj: full Hadamard on intermediate_size - had_K_full, K_full = get_hadK(intermediate_size) - - # o_proj: cross-head Hadamard on num_heads - had_K_head, K_head = get_hadK(num_heads) - - # Identify target module suffixes from mapping - mlp_out_suffix = mapping.mlp_out.split(".")[-1] # e.g. "down_proj" - attn_o_suffix = mapping.attn_o.split(".")[-1] # e.g. 
"o_proj" - - handles = [] - for name, module in model.named_modules(): - if name.endswith(mlp_out_suffix) and isinstance(module, nn.Linear): - hook = FullOnlineHadamardHook( - had_K=had_K_full, - K=K_full, - fp32_had=fp32_had, - use_fast_had=use_fast_had, - ) - h = module.register_forward_pre_hook(hook) - handles.append(h) - elif name.endswith(attn_o_suffix) and isinstance(module, nn.Linear): - hook = CrossHeadOnlineHadamardHook( - had_K=had_K_head, - K=K_head, - head_dim=head_dim, - fp32_had=fp32_had, - use_fast_had=use_fast_had, - ) - h = module.register_forward_pre_hook(hook) - handles.append(h) - - return handles - - -def is_pow2(n): - return (n & (n - 1) == 0) and (n > 0) - - -# Adapted from https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py -def get_hadK(n: int, transpose=False) -> (torch.Tensor, int): - hadK, K = None, None - - if is_pow2(n): - K = 1 - return hadK, K - else: - from auto_round.experimental.transform.utils.hadamard import _fetch_hadamard_divisor - - hadK = _fetch_hadamard_divisor(n, torch.float, torch.device("cpu")) - if transpose: - hadK = hadK.T - if hadK is not None: - return hadK, 1 if is_pow2(hadK.shape[0]) else hadK.shape[0] - assert is_pow2(n) - - -def matmul_hadU(X, transpose=False): - n = X.shape[-1] - hadK, K = get_hadK(n, transpose) - input = X.clone().view(-1, n, 1) - output = input.clone() - while input.shape[1] > K: - input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2]) - output = output.view(input.shape) - output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :] - output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :] - output = output.view(input.shape[0], input.shape[1], -1) - input, output = (output, input) - del output - - if K > 1: - # Do not explicitly repeat - OOM - # input = torch.bmm( - # hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input) - # Use bcast instead - input = hadK.view(1, K, K).to(input) @ input - - return input.view(X.shape) / torch.tensor(n).sqrt() - - -def matmul_hadUt(X): - return matmul_hadU(X, transpose=True) - - -def random_hadamard_matrix(size, device): - # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation" - Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64) - Q = Q * 2 - 1 - Q = torch.diag(Q) - return matmul_hadU(Q).to(device) - - -def deterministic_hadamard_matrix(size, device): - """Build a deterministic Hadamard matrix of the given *size*. - - Applies the butterfly ``matmul_hadU`` to an identity matrix so that the - result is purely determined by ``get_hadK`` (no random sign flips). 
- """ - Q = torch.eye(size, dtype=torch.float64) - return matmul_hadU(Q).to(device) - - -def matmul_hadU_cuda(X, hadK, K, use_fast_had=True): - n = X.shape[-1] - if not use_fast_had or fast_hadamard_transform is None: - return matmul_hadU(X) - if K == 1: - return fast_hadamard_transform.hadamard_transform(X.contiguous(), 1.0 / torch.tensor(n).sqrt()) - # if transpose: - # hadK = hadK.T.contiguous() - input = X.view(*X.shape[:-1], K, n // K) - input = fast_hadamard_transform.hadamard_transform(input.contiguous(), 1.0 / torch.tensor(n).sqrt()) - input = hadK.to(input.device).to(input.dtype) @ input - return input.reshape(X.shape) - - -def matmul_hadUt_cuda(X, hadK, K, use_fast_had=True): - return matmul_hadU_cuda(X, hadK, K, use_fast_had=use_fast_had) - - -def apply_exact_had_to_linear( - module, had_dim=-1, output=False, use_fast_had=True, compute_device=None, had_matrix=None -): - """Apply Hadamard rotation to a Linear layer's weight in-place. - - Args: - module: ``nn.Linear`` layer. - had_dim: Dimension of each Hadamard block (``-1`` for full dimension). - output: If ``True`` rotate the output (row) side; otherwise input (col). - use_fast_had: Use ``fast_hadamard_transform`` when available. - compute_device: Device to run computation on. - had_matrix: Optional custom rotation matrix. When ``had_dim == -1`` - this should be a square tensor whose size equals - ``out_features`` (output) or ``in_features`` (input). When - ``had_dim > 0`` the size should equal ``had_dim``. - """ - assert isinstance(module, torch.nn.Linear) - in_features, out_features = module.in_features, module.out_features - - if had_dim != -1 and had_matrix is None: - assert is_pow2(had_dim), "Hadamard dimension must be a power of 2!" - - W_ = module.weight.data - dtype = W_.dtype - dev = W_.device - init_shape = W_.shape - compute_dev = _resolve_compute_device(compute_device) - W_ = W_.double().to(compute_dev) - - if had_matrix is not None: - H = had_matrix.to(device=compute_dev, dtype=torch.float64) - if had_dim == -1: - # Full-dimension custom matrix - if output: - # W.T = H @ W.T → W = (H @ W.T).T = W @ H.T - W_ = W_ @ H.T - else: - # W = H @ W (rotate input columns: W_new[i,:] = sum H[i,j]*W[j,:]) - # Actually for input side: W_new = W @ H (each row is rotated) - W_ = W_ @ H.T - else: - # Per-block custom matrix - if output: - W_ = W_.t() - transposed_shape = W_.shape - flat = W_.reshape(-1, had_dim) - W_ = (flat @ H.T).reshape(transposed_shape).t() - else: - flat = W_.reshape(-1, had_dim) - W_ = (flat @ H.T).reshape(init_shape) - elif had_dim == -1: - if output: - had_K, K = get_hadK(out_features) - W_ = matmul_hadU_cuda(W_.t(), had_K, K, use_fast_had=use_fast_had).t() - if not output: - had_K, K = get_hadK(in_features) - W_ = matmul_hadU_cuda(W_, had_K, K, use_fast_had=use_fast_had) - else: - # Apply Hadamard to the last had_dim chunks of the weights - if output: - W_ = W_.t() - transposed_shape = W_.shape - if use_fast_had and fast_hadamard_transform is not None: - W_ = ( - fast_hadamard_transform.hadamard_transform( - W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), scale=1 / math.sqrt(had_dim) - ) - .reshape(transposed_shape) - .t() - ) - else: - W_ = matmul_hadU(W_.reshape(-1, had_dim)).reshape(transposed_shape).t() - else: - if use_fast_had and fast_hadamard_transform is not None: - n = W_.shape[1] - W_ = fast_hadamard_transform.hadamard_transform( - W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) - ).reshape(init_shape) - else: - W_ = matmul_hadU(W_.reshape(-1, 
had_dim)).reshape(init_shape) - module.weight.data = W_.to(device=dev, dtype=dtype) - - -def apply_cross_head_had_to_linear( - module, num_heads, head_dim, use_fast_had=True, compute_device=None, had_matrix=None -): - """Apply a cross-head Hadamard rotation to a Linear layer's input side. - - The operation is equivalent to ``(H_cross ⊗ I_head_dim)`` applied to the - input columns: - - * Reshape columns ``(hidden_size,)`` → ``(num_heads, head_dim)`` - * Transpose → ``(head_dim, num_heads)`` - * Hadamard on the ``num_heads`` axis - * Transpose back and reshape - - This mirrors what :class:`CrossHeadOnlineHadamardHook` does at runtime. - - Args: - module: ``nn.Linear`` layer whose ``in_features == num_heads * head_dim``. - num_heads: Number of attention heads. - head_dim: Per-head dimension. - use_fast_had: Use ``fast_hadamard_transform`` when available. - compute_device: Device to run computation on. - had_matrix: Optional custom rotation matrix of shape ``(num_heads, num_heads)``. - """ - assert isinstance(module, torch.nn.Linear) - W_ = module.weight.data - dtype = W_.dtype - dev = W_.device - compute_dev = _resolve_compute_device(compute_device) - W_ = W_.double().to(compute_dev) - - out_f = W_.shape[0] - # W shape: (out_features, hidden_size) where hidden_size = num_heads * head_dim - # Reshape columns: (out_f, num_heads, head_dim) - W_ = W_.reshape(out_f, num_heads, head_dim) - # Transpose last two dims: (out_f, head_dim, num_heads) - W_ = W_.transpose(1, 2).contiguous() - - if had_matrix is not None: - H = had_matrix.to(device=compute_dev, dtype=torch.float64) - # Apply H on last dim (num_heads): flat @ H.T - flat = W_.reshape(-1, num_heads) - W_ = (flat @ H.T).reshape(out_f, head_dim, num_heads) - elif use_fast_had and fast_hadamard_transform is not None and is_pow2(num_heads): - W_ = fast_hadamard_transform.hadamard_transform(W_, scale=1.0 / math.sqrt(num_heads)) - else: - W_ = matmul_hadU(W_.reshape(-1, num_heads)).reshape(out_f, head_dim, num_heads) - - # Transpose back: (out_f, num_heads, head_dim) → (out_f, hidden_size) - W_ = W_.transpose(1, 2).contiguous().reshape(out_f, num_heads * head_dim) - module.weight.data = W_.to(device=dev, dtype=dtype) - - -# --------------------------------------------------------------------------- -# Grouped (block-diagonal) Hadamard utilities -# --------------------------------------------------------------------------- - - -class OnlineHadamardPostHook(nn.Module): - """Forward hook (post-hook) adapter: wraps a pre-hook-style Hadamard - transform to apply it on the layer's **output** instead of input. - - Used for v_proj per-head Hadamard on the output side when online - rotation is not fused into weights. - """ - - def __init__(self, pre_hook): - super().__init__() - self.pre_hook = pre_hook - - def __call__(self, module, input, output): - result = self.pre_hook(module, (output,)) - if isinstance(result, tuple): - return result[0] - return result - - -class GroupOnlineHadamardHook(nn.Module): - """Pre-forward hook: block-diagonal Hadamard with fixed ``group_size`` on last dim. - - Reshapes ``(*, D)`` → ``(*, D // group_size, group_size)``, applies Hadamard - per group, then reshapes back. Much cheaper than a full-dimension Hadamard. 
- """ - - def __init__(self, group_size, fp32_had=False, use_fast_had=True, had_matrix=None): - super().__init__() - self.group_size = group_size - self.fp32_had = fp32_had - self.use_fast_had = use_fast_had - self.custom_had = had_matrix is not None - - if had_matrix is not None: - self.register_buffer("had_matrix", had_matrix) - self.had_K = None - self.K = None - elif not is_pow2(group_size): - had_K, K = get_hadK(group_size) - if had_K is not None: - self.register_buffer("had_K", had_K) - else: - self.had_K = None - self.K = K - else: - self.had_K = None - self.K = 1 - - def __call__(self, module: nn.Module, args): - x = args[0] if isinstance(args, tuple) else args - x_dtype = x.dtype - init_shape = x.shape - gs = self.group_size - - if self.fp32_had: - x = x.float() - - # Reshape: (*, D) → (*, D//gs, gs) - x = x.reshape(*init_shape[:-1], init_shape[-1] // gs, gs) - - if self.custom_had: - H = self.had_matrix.to(device=x.device, dtype=x.dtype) - flat = x.reshape(-1, gs) - x = (flat @ H.T).reshape(*init_shape[:-1], init_shape[-1] // gs, gs) - elif self.use_fast_had and fast_hadamard_transform is not None and self.K == 1: - x = fast_hadamard_transform.hadamard_transform(x, scale=1.0 / math.sqrt(gs)) - else: - x = x.reshape(-1, gs) - x = matmul_hadU(x) - x = x.reshape(*init_shape[:-1], init_shape[-1] // gs, gs) - - x = x.reshape(init_shape) - - if self.fp32_had: - x = x.to(x_dtype) - - if isinstance(args, tuple): - return (x,) + args[1:] - return x - - -def _apply_grouped_had_to_weight(W, group_size, side="input", use_fast_had=True, had_matrix=None): - """Apply block-diagonal Hadamard to a weight matrix. - - Args: - W: Weight tensor, shape (out_features, in_features). - group_size: Block size for the Hadamard rotation. - side: ``'input'`` rotates columns (in_features dim), - ``'output'`` rotates rows (out_features dim). - use_fast_had: Use fast_hadamard_transform if available. - had_matrix: Optional custom Hadamard matrix of shape ``(gs, gs)`` - to use instead of the built-in Hadamard. - - Returns: - Rotated weight tensor. - """ - gs = group_size - dtype = W.dtype - W = W.double() - - def _had_on_last_dim(X): - """Apply Hadamard on the last dimension (size gs) of X shaped (..., gs).""" - if had_matrix is not None: - H = had_matrix.to(device=X.device, dtype=X.dtype) - # X: (..., gs) → batch matmul with H^T → X @ H^T - flat = X.reshape(-1, gs) - return (flat @ H.T).reshape(X.shape) - if use_fast_had and fast_hadamard_transform is not None and is_pow2(gs): - return fast_hadamard_transform.hadamard_transform(X, scale=1.0 / math.sqrt(gs)) - orig_shape = X.shape - return matmul_hadU(X.reshape(-1, gs)).reshape(orig_shape) - - if side == "input": - out_f, in_f = W.shape - W = W.reshape(out_f, in_f // gs, gs) - W = _had_on_last_dim(W) - W = W.reshape(out_f, in_f) - else: - out_f, in_f = W.shape - Wt = W.t().contiguous() - Wt = Wt.reshape(in_f, out_f // gs, gs) - Wt = _had_on_last_dim(Wt) - W = Wt.reshape(in_f, out_f).t().contiguous() - - return W.to(dtype) - - -def _rotate_linear_grouped(module, group_size, side="input", use_fast_had=True, compute_device=None, had_matrix=None): - """Apply block-diagonal Hadamard rotation to a Linear layer's weight. - - Args: - module: ``nn.Linear`` layer. - group_size: Block size. - side: ``'input'`` or ``'output'``. - use_fast_had: Use fast_hadamard_transform. - compute_device: Device to run computation on. If None, auto-detects GPU. - had_matrix: Optional custom Hadamard matrix of shape ``(gs, gs)``. 
- """ - dtype = module.weight.data.dtype - dev = module.weight.data.device - compute_dev = _resolve_compute_device(compute_device) - W = module.weight.data.to(device=compute_dev, dtype=torch.float64) - W = _apply_grouped_had_to_weight(W, group_size, side=side, use_fast_had=use_fast_had, had_matrix=had_matrix) - module.weight.data = W.to(device=dev, dtype=dtype) - - if side == "output" and module.bias is not None: - bias = module.bias.data.to(device=compute_dev, dtype=torch.float64) - gs = group_size - bias = bias.reshape(-1, gs) - if had_matrix is not None: - H = had_matrix.to(device=compute_dev, dtype=torch.float64) - bias = (bias @ H.T).reshape(-1) - elif use_fast_had and fast_hadamard_transform is not None and is_pow2(gs): - bias = ( - fast_hadamard_transform.hadamard_transform(bias.unsqueeze(0), scale=1.0 / math.sqrt(gs)) - .squeeze(0) - .reshape(-1) - ) - else: - bias = matmul_hadU(bias).reshape(-1) - module.bias.data = bias.to(device=dev, dtype=dtype) - - -def _rotate_embedding_grouped(embedding, group_size, use_fast_had=True, compute_device=None, had_matrix=None): - """Apply block-diagonal Hadamard rotation to an Embedding layer. - - Embedding weight: (vocab, hidden_size) → rotate on hidden_size (columns). - """ - dtype = embedding.weight.data.dtype - dev = embedding.weight.data.device - compute_dev = _resolve_compute_device(compute_device) - W = embedding.weight.data.to(device=compute_dev, dtype=torch.float64) - W = _apply_grouped_had_to_weight(W, group_size, side="input", use_fast_had=use_fast_had, had_matrix=had_matrix) - new_W = W.to(device=dev, dtype=dtype) - del W - embedding.weight.data = new_W - - -def register_online_had_hooks_grouped(model, mapping, group_size, fp32_had=False, use_fast_had=True): - """Register per-group online Hadamard hooks on ``down_proj`` and ``o_proj``. - - In grouped mode: - - **down_proj**: block-diagonal Hadamard on ``intermediate_size`` with ``group_size``. - - **o_proj**: block-diagonal Hadamard on ``hidden_size`` with ``group_size``. - - Args: - model: HuggingFace model with rotated weights. - mapping: RotationMapping. - group_size: Block size for block-diagonal Hadamard. - fp32_had: Compute in fp32. - use_fast_had: Use fast_hadamard_transform. - - Returns: - list of hook handles. - """ - mlp_out_suffix = mapping.mlp_out.split(".")[-1] - attn_o_suffix = mapping.attn_o.split(".")[-1] - - handles = [] - for name, module in model.named_modules(): - if name.endswith(mlp_out_suffix) and isinstance(module, nn.Linear): - hook = GroupOnlineHadamardHook( - group_size=group_size, - fp32_had=fp32_had, - use_fast_had=use_fast_had, - ) - h = module.register_forward_pre_hook(hook) - handles.append(h) - elif name.endswith(attn_o_suffix) and isinstance(module, nn.Linear): - hook = GroupOnlineHadamardHook( - group_size=group_size, - fp32_had=fp32_had, - use_fast_had=use_fast_had, - ) - h = module.register_forward_pre_hook(hook) - handles.append(h) - - return handles +from auto_round.algorithms.transforms.rotation.inplace.hooks import * # noqa: F401, F403 diff --git a/auto_round/experimental/transform/rotation_config.py b/auto_round/experimental/transform/rotation_config.py index dfcfb5d45..fad2ede62 100644 --- a/auto_round/experimental/transform/rotation_config.py +++ b/auto_round/experimental/transform/rotation_config.py @@ -1,61 +1,11 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. 
-from typing import Optional +The canonical ``RotationConfig`` schema now lives in +:mod:`auto_round.algorithms.transforms.rotation.config`. +""" -from pydantic import BaseModel, Field, field_validator +from auto_round.algorithms.transforms.rotation.config import RotationConfig # noqa: F401 __all__ = ["RotationConfig"] - - -class RotationConfig(BaseModel): - """ - Unified configuration for Hadamard rotation/transform applied to a model. - - Two implementation paths are supported: - - * ``backend="inplace"`` -> ``auto_round.experimental.rotation_inplace`` - QuaRot-style residual-stream / per-layer rotation. Supports any - weight/activation dtype (incl. INT4/INT8/FPx). Can optionally fuse - the online Hadamard into weights (``fuse_online_to_weight=True``). - * ``backend="transform"`` -> ``auto_round.experimental.transform`` - Per-Linear weight + activation Hadamard with a fused triton kernel. - **Only supports MXFP4 / NVFP4** and **cannot fuse online to weight.** - * ``backend="auto"`` (default) - - If ``fuse_online_to_weight=True`` -> inplace (fused). - - Else if ``data_type`` is MX-FP / NV-FP -> transform. - - Otherwise -> inplace (unfused). - - Notes: - * ``block_size`` is the group/block size for grouped Hadamard. - For ``backend="inplace"`` it is forwarded as ``group_size`` (``None`` - / ``-1`` means full-dimension Hadamard). - """ - - # ---- shared ---- - backend: str = Field(default="auto") - block_size: Optional[int] = Field(default=None) - hadamard_type: str = Field(default="hadamard") - - # ---- inplace-only ---- - fuse_online_to_weight: Optional[bool] = Field(default=None) - allow_online_rotation: bool = Field(default=True) - - # for random hadamard transform (transform path) - random_seed: bool = Field(default=False, exclude=True) - - @field_validator("backend") - @classmethod - def validate_backend(cls, v: str) -> str: - allowed = {"auto", "inplace", "transform"} - if v not in allowed: - raise ValueError(f"Unsupported backend: {v}. Supported values: {sorted(allowed)}") - return v - - @field_validator("hadamard_type") - @classmethod - def validate_hadamard_type(cls, v: str) -> str: - allowed = {"hadamard", "random_hadamard", "quarot_hadamard"} - if v not in allowed: - raise ValueError(f"Unsupported hadamard_type: {v}. 
Supported values: {sorted(allowed)}") - return v diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index ea1dade78..3965ba71e 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -138,84 +138,23 @@ def is_triton_kernel_available(data_type: str) -> bool: def dump_group_size_to_rotation_config(rotation_config: str | dict | RotationConfig, group_size: int): - rotation_dict = to_dict_rotation_config(rotation_config) - if rotation_dict.get("block_size", None) is None: - rotation_dict["block_size"] = group_size - return rotation_dict + from auto_round.algorithms.transforms.rotation.config import ( + dump_group_size_to_rotation_config as _impl, + ) + + return _impl(rotation_config, group_size) def to_dict_rotation_config(rotation_config: str | dict | RotationConfig): - if isinstance(rotation_config, str): - key = rotation_config.strip() - if not key: - return {} + from auto_round.algorithms.transforms.rotation.config import to_dict_rotation_config as _impl - if key == "default": - cfg_dict = {"hadamard_type": "hadamard"} - else: - cfg_dict = {"hadamard_type": key} - elif isinstance(rotation_config, RotationConfig): - cfg_dict = rotation_config.model_dump() - else: - cfg_dict = dict(rotation_config) - return cfg_dict + return _impl(rotation_config) def normalize_rotation_config(rotation_config: str | dict | RotationConfig | None, data_type: str) -> dict[str, Any]: - """ - Normalize and validate `rotation_config`. - - Supported input types: - - None -> {} - - dict -> validated via RotationConfig - - RotationConfig -> validated & converted to dict - - str -> shorthand for `hadamard_type` in HADAMARDS keys - - Additional behavior: - - If block_size is not set by user: - - mx_fp -> default block_size to 32 - - nv_fp -> default block_size to 16 - - other data types -> emit a warning - - If block_size is set but does not match the recommended value: - - mx_fp expects 32 - - nv_fp expects 16 - - emit a warning - """ + from auto_round.algorithms.transforms.rotation.config import normalize_rotation_config as _impl - def _apply_data_type_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set: bool) -> dict[str, Any]: - block_size = cfg_dict.get("block_size") - - if not block_size_explicitly_set or block_size is None: - if is_mx_fp(data_type): - cfg_dict["block_size"] = 32 - logger.warning("block_size is not set for data_type 'mx_fp'; defaulting to 32.") - elif is_nv_fp(data_type): - cfg_dict["block_size"] = 16 - logger.warning("block_size is not set for data_type 'nv_fp'; defaulting to 16.") - else: - logger.warning( - f"block_size is not set and cannot be inferred for data_type {data_type!r}; " - "please set block_size explicitly in rotation_config if needed." 
- ) - else: - if is_mx_fp(data_type) and block_size != 32: - logger.warning(f"data_type is 'mx_fp' but block_size={block_size}; recommended value is 32.") - elif is_nv_fp(data_type) and block_size != 16: - logger.warning(f"data_type is 'nv_fp' but block_size={block_size}; recommended value is 16.") - - return cfg_dict - - # 1) None -> {} - if rotation_config is None: - return {} - - rotation_dict = to_dict_rotation_config(rotation_config) - block_size_explicitly_set = "block_size" in rotation_dict - cfg_dict = _apply_data_type_block_size(rotation_dict, block_size_explicitly_set) - try: - return RotationConfig.model_validate(cfg_dict).model_dump() - except Exception as e: - raise ValueError(f"Invalid RotationConfig: {e}") from e + return _impl(rotation_config, data_type) def check_supported_schemes(scheme: str): From 01f02cf2dbfe0b3bde541b503d1d5f2d8cf66443 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 13:17:58 +0800 Subject: [PATCH 78/90] refactor(rotation): convert experimental/transform/apply.py to shim The transform-backend rotation logic is already present (as a strict superset including the HadamardRotation BaseRotation subclass) in auto_round/algorithms/transforms/rotation/apply.py. With this change, every file under auto_round/experimental/transform/ is now a pure re-export shim pointing at its canonical new-arch home: - rotation_config.py -> algorithms/transforms/rotation/config - apply.py -> algorithms/transforms/rotation/apply - hadamards.py -> algorithms/transforms/rotation/transforms - patch_modules.py -> algorithms/transforms/rotation/patch - triton/mxfp4.py -> algorithms/transforms/rotation/utils/triton/mxfp4 - utils/matrix.py -> algorithms/transforms/rotation/utils/matrix - utils/hadamard.py -> algorithms/transforms/rotation/utils/math --- auto_round/experimental/transform/apply.py | 197 +-------------------- 1 file changed, 7 insertions(+), 190 deletions(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index aafd4c1b0..e0aa77e13 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -1,196 +1,13 @@ # # Copyright (C) 2026 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 +"""Backward-compat re-export shim. -import torch -import tqdm +The canonical implementation now lives in +:mod:`auto_round.algorithms.transforms.rotation.apply`. +""" -from auto_round.experimental.qmodules.base import QModuleBase -from auto_round.experimental.transform.hadamards import build_hadamard_transform -from auto_round.experimental.transform.rotation_config import RotationConfig -from auto_round.experimental.utils import is_triton_kernel_available, normalize_rotation_config +from auto_round.algorithms.transforms.rotation.apply import ( # noqa: F401 + apply_rotation_transform, +) __all__ = ["apply_rotation_transform"] - - -def apply_rotation_transform( - model: torch.nn.Module, - config: str | dict | RotationConfig | None, - location: str = "weight", - use_tqdm=True, - desc=None, - data_type="mx_fp", -): - """ - Apply a transform configuration to a model. - - Weight and activation transforms are attached as submodules and are - triggered via PyTorch hooks. - - :param model: Model to which the transform configuration will be applied. - :param config: Transform configuration to apply. Supported values are: - * ``str``: A named/preset transform configuration. In this case, - resolved to a concrete quantization/transform configuration. 
- * ``dict``: A raw configuration mapping that will be normalized - (via :func:`normalize_rotation_config`) and then passed to - :class:`TransformConfig`. - * :class:`TransformConfig`: An existing configuration instance. - This will be used to construct the final configuration after - normalization. - * ``None``: Uses the default behavior of - :func:`_normalize_rotation_config` (for example, inferring a - configuration from ``data_type`` or other project defaults), if - supported. - :param data_type: quantization data type. - :param use_tqdm: If ``True``, wrap the per-module application in a - tqdm progress bar. - :param desc: Optional description string to show in the tqdm progress - bar. If ``None``, a description will be derived from - ``config.transform_type``. - """ - - config = normalize_rotation_config(config, data_type) - if not isinstance(config, RotationConfig): - config = RotationConfig(**config) - - modules_config = [ - (name, module, config) - for name, module in model.named_modules() - if isinstance(module, torch.nn.Linear) or isinstance(module, QModuleBase) - ] - - desc = f"Applying {config.hadamard_type} transforms" if desc is None else desc - for name, module, config in tqdm.tqdm(modules_config, desc=desc, disable=(not use_tqdm)): - if "lm_head" in name: - continue - _apply_to_module(model, module, config, location, data_type) - - # attach config to model for compression/serialization. Use a plain dict so - # that downstream HF `save_pretrained` -> JSON works (RotationConfig is a - # pydantic model and is not directly JSON serializable). - setattr(model, "rotation_config", config.model_dump() if hasattr(config, "model_dump") else config) - hooks = None - - return model, hooks - - -def _apply_to_module( - model: torch.nn.Module, - module: torch.nn.Module, - config: RotationConfig, - location: str = "weight", - data_type: str = "mx_fp", -): - """ - Create transforms and apply them to the module - - :param model: model which module belongs to - :param module: target module to apply transforms to - """ - - # create transform as submodule - hadamard_name = config.hadamard_type - - if location == "input": - - # activation needs transpose - input_hadamard_transform = build_hadamard_transform( - **config.model_dump(), - location="input", - inverse=True, - device="cpu", - precision=module.dtype, # for online activation, the transform dtype maybe bfloat16/float16. 
- ) - - if config.hadamard_type != "random_hadamard": - hadamard_weight = input_hadamard_transform.weight - else: - hadamard_weight = None - - if is_triton_kernel_available(data_type): - from auto_round.experimental.transform.triton.mxfp4 import mxfp4_forward_kernel_wrapper - - def input_hook(self, args): - input = args[0] - # transform(input) - orig_shape = input.shape - orig_dtype = input.dtype - x_flat = input.contiguous().flatten(end_dim=-2) - qdq_input, _ = mxfp4_forward_kernel_wrapper( - x_flat, - ( - hadamard_weight.to(orig_dtype) - if hadamard_weight is not None - else self.hadamard_matrix.T.to(orig_dtype) - ), # this matrix from w_transform, needs transpose - ) - return qdq_input.reshape(orig_shape).to(orig_dtype) - - # for fused transform + quantization kernel - module.pre_dequantized_input = True - module.register_forward_pre_hook(input_hook, prepend=True) - else: - - from auto_round.experimental.transform.utils.matrix import _multihead_matmul - - def input_hook(self, args): - input = args[0] - - ori_shape = input.shape - orig_dtype = input.dtype - - if hadamard_weight is not None: - input = input.view(-1, hadamard_weight.shape[0]) - return ( - (_multihead_matmul(input, hadamard_weight.to(input.device).to(orig_dtype))) - .view(ori_shape) - .to(orig_dtype) - ) - else: - input = input.view(-1, self.hadamard_matrix.shape[0]) - return ( - (_multihead_matmul(input, self.hadamard_matrix.T.to(orig_dtype))).view(ori_shape).to(orig_dtype) - ) - - # for fused transform + quantization kernel - module.pre_dequantized_input = False - module.register_forward_pre_hook(input_hook, prepend=True) - - elif location == "weight": - # eagerly apply transformation to weight - # fuse transform into weight - assert hasattr(module, "weight") - - weight_hadamard_transform = build_hadamard_transform( - **config.model_dump(), - location="weight", - device=module.weight.device, - ) - - # need save random hadamard matrix needed when inference - if config.hadamard_type == "random_hadamard": - # for saving transform weight - from auto_round.experimental.transform.patch_modules import patch_quantlinear - - patch_quantlinear(weight_hadamard_transform) - - # for autoround tuning: weight not tuning - # for rtn: weight transformed before saving - from auto_round.experimental.transform.patch_modules import ( - patch_wrapperlinear_to_apply_transform, - patch_wrapperwalayer_forward_to_apply_transform, - ) - - input_hadamard_transform = build_hadamard_transform( - **config.model_dump(), - location="input", - inverse=True, - device=module.weight.device, - precision=module.weight.dtype, # for online activation, the transform dtype maybe bfloat16/float16. - ) - - patch_wrapperlinear_to_apply_transform(weight_hadamard_transform, input_hadamard_transform) - patch_wrapperwalayer_forward_to_apply_transform(input_hadamard_transform) - - else: - # TODO: apply transform to output/q/k - raise NotImplementedError() From 08caa0d4dbeaea2cffccab35a84ce7ea4be1f262 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 13:20:25 +0800 Subject: [PATCH 79/90] refactor(entry): simplify rotation_config kwarg handling in AutoRoundCompatible Since RotationConfig is now unified (experimental shim -> canonical algorithms/transforms/rotation/config.RotationConfig), drop the redundant three-step translation (old RC -> dict -> normalize -> new RC) and use the already-imported _NewArchRotationConfig directly. Behavior is preserved: inplace backend still warns+skips, other backends are appended to alg_configs list unchanged. 
--- auto_round/compressors_new/entry.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index ef9c1bd34..2d15e75a3 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -455,24 +455,19 @@ def __new__( # Determine output format if specified format = kwargs.pop("format", None) - # Extract rotation_config (old-API kwarg) and convert to new arch RotationConfig. - # In old arch, rotation_config was a keyword arg; in new arch, rotation transforms - # are passed as part of alg_configs list. "inplace" backend is not yet supported - # in the new arch (requires CUDA/triton), so we only convert transform-compatible configs. + # Extract rotation_config (old-API kwarg) and thread it into alg_configs. + # In old arch this was a standalone keyword arg; the new arch passes rotation + # transforms as part of the alg_configs list. backend='inplace' is not yet + # wired into the new-arch alg_configs pipeline, so warn and skip. _rotation_config_raw = kwargs.pop("rotation_config", None) if _rotation_config_raw is not None: - from auto_round.algorithms.transforms.rotation.config import RotationConfig as _NARotCfg - from auto_round.algorithms.transforms.rotation.config import normalize_rotation_config as _normalize_rc - from auto_round.experimental.transform.rotation_config import RotationConfig as _RotationConfig - - # Resolve to a RotationConfig to check the backend field - if isinstance(_rotation_config_raw, _RotationConfig): + if isinstance(_rotation_config_raw, _NewArchRotationConfig): _rc = _rotation_config_raw elif isinstance(_rotation_config_raw, dict): - _rc = _RotationConfig.model_validate(_rotation_config_raw) + _rc = _NewArchRotationConfig.model_validate(_rotation_config_raw) else: - # str ("default", "random_hadamard", …) or plain dict - _rc = _RotationConfig() + # str alias ("default", "random_hadamard", …) -> default config + _rc = _NewArchRotationConfig() if _rc.backend == "inplace": logger.warning( @@ -481,13 +476,7 @@ def __new__( "with an MXFP4/NVFP4 scheme, or pass RotationConfig() explicitly via alg_configs." ) else: - # Convert to new arch RotationConfig. - # normalize_rotation_config accepts None/str/dict/RotationConfig, so - # convert old-arch RotationConfig instances to dict first (dropping backend field). - _raw_for_norm = _rc.model_dump(exclude={"backend", "fuse_online_to_weight", "allow_online_rotation"}) - hadamard_dict = _normalize_rc(_raw_for_norm) - hadamard_cfg = _NARotCfg.model_validate(hadamard_dict) - config = [config, hadamard_cfg] + config = [config, _rc] # Extract MLLM-specific parameters processor = kwargs.pop("processor", None) From 70631e999161a5ac9ba8eed4b10b618c983719d4 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 13:27:33 +0800 Subject: [PATCH 80/90] feat(rotation): support backend='inplace' in new-arch alg_configs pipeline HadamardRotation.apply_to_model now dispatches on cfg.backend via resolve_hadamard_backend: * backend resolves to 'inplace' -> delegate to auto_round.algorithms.transforms.rotation.inplace.apply_rotation_transform (QuaRot residual-stream rotation, works for any dtype incl. fp8/int). * backend resolves to 'transform' -> unchanged triton-fused per-Linear path for MXFP4 / NVFP4. 
Fuse flag resolution (explicit > AR_FUSE_ONLINE_ROTATION env > default) mirrors the standalone dispatcher so the BaseRotation path and the top-level apply_hadamard_rotation dispatcher behave identically. entry.py: drop the inplace warn+skip branch in AutoRoundCompatible -- the new arch now handles all backends uniformly via the alg_configs list, so the rotation_config kwarg is threaded through unchanged. --- .../algorithms/transforms/rotation/apply.py | 33 +++++++++++++++++++ auto_round/compressors_new/entry.py | 14 ++------ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/auto_round/algorithms/transforms/rotation/apply.py b/auto_round/algorithms/transforms/rotation/apply.py index bf2144e76..62ceb02b5 100644 --- a/auto_round/algorithms/transforms/rotation/apply.py +++ b/auto_round/algorithms/transforms/rotation/apply.py @@ -107,6 +107,39 @@ def apply_to_model( """ cfg = self.config + # Dispatch by backend. The transform backend (triton-fused per-Linear) + # is implemented below; the inplace (QuaRot) backend is delegated to + # :mod:`auto_round.algorithms.transforms.rotation.inplace`. + from auto_round.algorithms.transforms.rotation.dispatcher import resolve_hadamard_backend + + backend = resolve_hadamard_backend(cfg, data_type) + if backend == "inplace": + import auto_round.envs as envs + from auto_round.algorithms.transforms.rotation.inplace import apply_rotation_transform as _inplace_apply + + # Resolve fuse flag: explicit > env var > default(False). + fuse_online_to_weight = cfg.fuse_online_to_weight + if cfg.fuse_online_to_weight is not None: + fuse_online_to_weight = bool(cfg.fuse_online_to_weight) + elif envs.AR_FUSE_ONLINE_ROTATION: + fuse_online_to_weight = bool(envs.AR_FUSE_ONLINE_ROTATION) + + bs = cfg.block_size + group_size = bs if (bs is not None and bs > 0) else None + + compute_device = kwargs.get("compute_device") + model, _hooks = _inplace_apply( + model, + group_size=group_size, + allow_online_rotation=cfg.allow_online_rotation, + rotation_matrix=cfg.hadamard_type, + fuse_online_to_weight=fuse_online_to_weight, + compute_device=compute_device, + ) + setattr(model, "rotation_config", cfg.model_dump()) + return model + + # backend == "transform": original per-Linear triton-fused path. # Collect target modules. target_types = (torch.nn.Linear, QModuleBase) diff --git a/auto_round/compressors_new/entry.py b/auto_round/compressors_new/entry.py index 2d15e75a3..7d519e007 100644 --- a/auto_round/compressors_new/entry.py +++ b/auto_round/compressors_new/entry.py @@ -457,8 +457,8 @@ def __new__( # Extract rotation_config (old-API kwarg) and thread it into alg_configs. # In old arch this was a standalone keyword arg; the new arch passes rotation - # transforms as part of the alg_configs list. backend='inplace' is not yet - # wired into the new-arch alg_configs pipeline, so warn and skip. + # transforms as part of the alg_configs list. All backends (auto / inplace / + # transform) are dispatched inside ``HadamardRotation.apply_to_model``. _rotation_config_raw = kwargs.pop("rotation_config", None) if _rotation_config_raw is not None: if isinstance(_rotation_config_raw, _NewArchRotationConfig): @@ -468,15 +468,7 @@ def __new__( else: # str alias ("default", "random_hadamard", …) -> default config _rc = _NewArchRotationConfig() - - if _rc.backend == "inplace": - logger.warning( - "rotation_config with backend='inplace' is not yet supported in the new architecture. " - "The rotation will be skipped. 
Use backend='transform' or backend='auto' " - "with an MXFP4/NVFP4 scheme, or pass RotationConfig() explicitly via alg_configs." - ) - else: - config = [config, _rc] + config = [config, _rc] # Extract MLLM-specific parameters processor = kwargs.pop("processor", None) From d5c8e9ba225b1741e1412c7f5ef190191dcd0b91 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Apr 2026 05:29:48 +0000 Subject: [PATCH 81/90] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../algorithms/transforms/rotation/config.py | 4 +- .../transforms/rotation/dispatcher.py | 3 +- .../transforms/rotation/inplace/apply.py | 12 ++-- .../experimental/transform/hadamards.py | 6 +- .../experimental/transform/utils/hadamard.py | 4 +- benchmark_both.py | 17 ++++-- docs/fp8_new_arch_debug_handoff.md | 2 +- profile_rss_per_block.py | 61 +++++++++++-------- 8 files changed, 61 insertions(+), 48 deletions(-) diff --git a/auto_round/algorithms/transforms/rotation/config.py b/auto_round/algorithms/transforms/rotation/config.py index fe579142c..2a8c50697 100644 --- a/auto_round/algorithms/transforms/rotation/config.py +++ b/auto_round/algorithms/transforms/rotation/config.py @@ -128,9 +128,7 @@ def to_dict_rotation_config(rotation_config: str | dict | RotationConfig | None) return dict(rotation_config) -def dump_group_size_to_rotation_config( - rotation_config: str | dict | RotationConfig, group_size: int -) -> dict[str, Any]: +def dump_group_size_to_rotation_config(rotation_config: str | dict | RotationConfig, group_size: int) -> dict[str, Any]: """Return *rotation_config* as a dict with ``block_size`` populated from *group_size* (if unset).""" rotation_dict = to_dict_rotation_config(rotation_config) if rotation_dict.get("block_size", None) is None: diff --git a/auto_round/algorithms/transforms/rotation/dispatcher.py b/auto_round/algorithms/transforms/rotation/dispatcher.py index af3dee646..ecae6652f 100644 --- a/auto_round/algorithms/transforms/rotation/dispatcher.py +++ b/auto_round/algorithms/transforms/rotation/dispatcher.py @@ -34,9 +34,8 @@ import torch import auto_round.envs as envs +from auto_round.algorithms.transforms.rotation.config import RotationConfig, normalize_rotation_config from auto_round.compressors.utils import is_mx_fp, is_nv_fp -from auto_round.algorithms.transforms.rotation.config import RotationConfig -from auto_round.algorithms.transforms.rotation.config import normalize_rotation_config from auto_round.utils import logger __all__ = ["apply_hadamard_rotation", "resolve_hadamard_backend"] diff --git a/auto_round/algorithms/transforms/rotation/inplace/apply.py b/auto_round/algorithms/transforms/rotation/inplace/apply.py index 680c1950d..3177d6fa9 100644 --- a/auto_round/algorithms/transforms/rotation/inplace/apply.py +++ b/auto_round/algorithms/transforms/rotation/inplace/apply.py @@ -14,12 +14,6 @@ import torch import tqdm -from auto_round.algorithms.transforms.rotation.inplace.model_config import ( - MAPPING_REGISTRY, - RotationMapping, - _resolve, - infer_mapping_from_model, -) from auto_round.algorithms.transforms.rotation.inplace.hooks import ( CrossHeadOnlineHadamardHook, FullOnlineHadamardHook, @@ -35,6 +29,12 @@ get_hadK, get_or_create_random_hadamard, ) +from auto_round.algorithms.transforms.rotation.inplace.model_config import ( + MAPPING_REGISTRY, + RotationMapping, + _resolve, + infer_mapping_from_model, +) # 
--------------------------------------------------------------------------- # Low-level primitives (model-agnostic via RotationMapping) diff --git a/auto_round/experimental/transform/hadamards.py b/auto_round/experimental/transform/hadamards.py index 8f52b80a5..83efa91b6 100644 --- a/auto_round/experimental/transform/hadamards.py +++ b/auto_round/experimental/transform/hadamards.py @@ -6,11 +6,13 @@ :mod:`auto_round.algorithms.transforms.rotation.transforms`. """ -from auto_round.algorithms.transforms.rotation.transforms import ( # noqa: F401 +from auto_round.algorithms.transforms.rotation.transforms import ( HADAMARDS, HadamardTransform, RandomHadamardTransform, - _filter_kwargs as filter_kwarg_dict, +) +from auto_round.algorithms.transforms.rotation.transforms import _filter_kwargs as filter_kwarg_dict # noqa: F401 +from auto_round.algorithms.transforms.rotation.transforms import ( build_hadamard_transform, ) diff --git a/auto_round/experimental/transform/utils/hadamard.py b/auto_round/experimental/transform/utils/hadamard.py index aa9e59d9a..8bed8a7c1 100644 --- a/auto_round/experimental/transform/utils/hadamard.py +++ b/auto_round/experimental/transform/utils/hadamard.py @@ -6,9 +6,9 @@ :mod:`auto_round.algorithms.transforms.rotation.utils.math`. """ -from auto_round.algorithms.transforms.rotation.utils.math import ( # noqa: F401 +from auto_round.algorithms.transforms.rotation.utils.math import _HADAMARD_MATRICES_PATH as REPO_PATH # noqa: F401 +from auto_round.algorithms.transforms.rotation.utils.math import ( _fetch_hadamard_divisor, - _HADAMARD_MATRICES_PATH as REPO_PATH, _matmul_hadU, deterministic_hadamard_matrix, is_pow2, diff --git a/benchmark_both.py b/benchmark_both.py index 3edbc04ef..320f18b11 100644 --- a/benchmark_both.py +++ b/benchmark_both.py @@ -10,18 +10,23 @@ import sys import time - MODEL = "Qwen/Qwen3-0.6B" ITERS = "200" SCHEME = "W4A16" DEVICE = "cuda:0" CMD_TEMPLATE = [ - sys.executable, "-m", "auto_round", - "--model_name", MODEL, - "--scheme", SCHEME, - "--iters", ITERS, - "--device", DEVICE, + sys.executable, + "-m", + "auto_round", + "--model_name", + MODEL, + "--scheme", + SCHEME, + "--iters", + ITERS, + "--device", + DEVICE, ] diff --git a/docs/fp8_new_arch_debug_handoff.md b/docs/fp8_new_arch_debug_handoff.md index a269b66f9..c6e4244c3 100644 --- a/docs/fp8_new_arch_debug_handoff.md +++ b/docs/fp8_new_arch_debug_handoff.md @@ -37,7 +37,7 @@ On HPU, new-arch `FP8_STATIC` (static W8A8-FP8) tuning leaked ~GBs of host RAM p 1. **Immediate packing trigger flag** `auto_round/compressors_new/calib.py` ~L1031: ```python - if self.compress_context.is_immediate_packing: # was: is_immediate_saving + if self.compress_context.is_immediate_packing: # was: is_immediate_saving ... ``` Without this, packed weights were held in CPU RAM indefinitely. 
diff --git a/profile_rss_per_block.py b/profile_rss_per_block.py index 06dd014f6..66b13ce59 100644 --- a/profile_rss_per_block.py +++ b/profile_rss_per_block.py @@ -9,6 +9,7 @@ # Old arch: AR_DISABLE_NEW_ARCH=1 python profile_rss_per_block.py """ + import gc import os import resource @@ -35,18 +36,19 @@ def rss_mb_clean(): def live_rss_mb(): """Current RSS in MB (not peak).""" - return _proc.memory_info().rss / (1024*1024) + return _proc.memory_info().rss / (1024 * 1024) def live_rss_mb_clean(): gc.collect() try: import ctypes + libc = ctypes.CDLL("libc.so.6") libc.malloc_trim(0) except Exception: pass - return _proc.memory_info().rss / (1024*1024) + return _proc.memory_info().rss / (1024 * 1024) arch = os.environ.get("AR_DISABLE_NEW_ARCH", "0") @@ -60,6 +62,7 @@ def live_rss_mb_clean(): if arch != "1": # NEW ARCH: patch CalibCompressor._quantize_single_block from auto_round.compressors_new import calib as calib_mod + _orig_quantize_single_block = calib_mod.CalibCompressor._quantize_single_block _orig_quantize_blocks = calib_mod.CalibCompressor._quantize_blocks @@ -76,6 +79,7 @@ def _patched_quantize_single_block(self, model, m, input_ids, input_others, q_in rss_after_gc = live_rss_mb() try: import ctypes + libc = ctypes.CDLL("libc.so.6") libc.malloc_trim(0) except Exception: @@ -83,21 +87,22 @@ def _patched_quantize_single_block(self, model, m, input_ids, input_others, q_in rss_after_trim = live_rss_mb() entry = { - 'block': block_idx, - 'before': rss_before, - 'after_return': rss_after_return, - 'after_gc': rss_after_gc, - 'after_trim': rss_after_trim, - 'delta_return': rss_after_return - rss_before, - 'delta_gc': rss_after_gc - rss_before, - 'delta_trim': rss_after_trim - rss_before, + "block": block_idx, + "before": rss_before, + "after_return": rss_after_return, + "after_gc": rss_after_gc, + "after_trim": rss_after_trim, + "delta_return": rss_after_return - rss_before, + "delta_gc": rss_after_gc - rss_before, + "delta_trim": rss_after_trim - rss_before, } _block_rss_log.append(entry) print( f" Block {block_idx:2d}: before={rss_before:.1f} after_ret={rss_after_return:.1f} " f"after_gc={rss_after_gc:.1f} after_trim={rss_after_trim:.1f} " f"delta_ret={entry['delta_return']:+.1f} delta_trim={entry['delta_trim']:+.1f} MB", - flush=True) + flush=True, + ) return result calib_mod.CalibCompressor._quantize_single_block = _patched_quantize_single_block @@ -105,6 +110,7 @@ def _patched_quantize_single_block(self, model, m, input_ids, input_others, q_in else: # OLD ARCH: patch LLMCompressor._quantize_block from auto_round.compressors import base as base_mod + _orig_quantize_block = base_mod.LLMCompressor._quantize_block _block_rss_log = [] @@ -120,6 +126,7 @@ def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, rss_after_gc = live_rss_mb() try: import ctypes + libc = ctypes.CDLL("libc.so.6") libc.malloc_trim(0) except Exception: @@ -127,21 +134,22 @@ def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, rss_after_trim = live_rss_mb() entry = { - 'block': block_idx, - 'before': rss_before, - 'after_return': rss_after_return, - 'after_gc': rss_after_gc, - 'after_trim': rss_after_trim, - 'delta_return': rss_after_return - rss_before, - 'delta_gc': rss_after_gc - rss_before, - 'delta_trim': rss_after_trim - rss_before, + "block": block_idx, + "before": rss_before, + "after_return": rss_after_return, + "after_gc": rss_after_gc, + "after_trim": rss_after_trim, + "delta_return": rss_after_return - rss_before, + "delta_gc": rss_after_gc - rss_before, 
+ "delta_trim": rss_after_trim - rss_before, } _block_rss_log.append(entry) print( f" Block {block_idx:2d}: before={rss_before:.1f} after_ret={rss_after_return:.1f} " f"after_gc={rss_after_gc:.1f} after_trim={rss_after_trim:.1f} " f"delta_ret={entry['delta_return']:+.1f} delta_trim={entry['delta_trim']:+.1f} MB", - flush=True) + flush=True, + ) return result base_mod.LLMCompressor._quantize_block = _patched_quantize_block @@ -157,7 +165,7 @@ def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, save_dir = "/tmp/profile_rss_output" shutil.rmtree(save_dir, ignore_errors=True) -print(f"\nCreating AutoRound instance...") +print("\nCreating AutoRound instance...") ar = AutoRound( model="Qwen/Qwen3-0.6B", scheme="FP8_STATIC", @@ -168,7 +176,7 @@ def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, print(f"After init RSS: {live_rss_mb():.1f} MB") print(f"After init RSS (clean): {live_rss_mb_clean():.1f} MB") -print(f"\nStarting quantize_and_save...\n") +print("\nStarting quantize_and_save...\n") model, folder = ar.quantize_and_save(output_dir=save_dir, format="llm_compressor") print(f"\n{'='*70}") @@ -176,16 +184,17 @@ def _patched_quantize_block(self, block, input_ids, input_others, q_input=None, print(f"{'='*70}") print(f"Final RSS: {live_rss_mb():.1f} MB") print(f"Final RSS (clean): {live_rss_mb_clean():.1f} MB") -print(f"\nPer-block deltas (after return, after gc+trim):") +print("\nPer-block deltas (after return, after gc+trim):") for e in _block_rss_log: print( f" Block {e['block']:2d}: delta_ret={e['delta_return']:+.1f} delta_trim={e['delta_trim']:+.1f} MB " - f"(abs: {e['after_trim']:.1f} MB)") + f"(abs: {e['after_trim']:.1f} MB)" + ) # Compute growth rate if len(_block_rss_log) >= 2: - first = _block_rss_log[0]['after_trim'] - last = _block_rss_log[-1]['after_trim'] + first = _block_rss_log[0]["after_trim"] + last = _block_rss_log[-1]["after_trim"] n = len(_block_rss_log) - 1 print(f"\nGrowth: {first:.1f} -> {last:.1f} MB over {n} blocks = {(last-first)/n:.1f} MB/block avg") From aa4c54073c3f8212fb7cefdad0c883e98f4030ed Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 13:29:56 +0800 Subject: [PATCH 82/90] merge and sync main Signed-off-by: n1ck-guo --- auto_round/experimental/utils.py | 4 +--- setup.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 3965ba71e..80e830bed 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -138,9 +138,7 @@ def is_triton_kernel_available(data_type: str) -> bool: def dump_group_size_to_rotation_config(rotation_config: str | dict | RotationConfig, group_size: int): - from auto_round.algorithms.transforms.rotation.config import ( - dump_group_size_to_rotation_config as _impl, - ) + from auto_round.algorithms.transforms.rotation.config import dump_group_size_to_rotation_config as _impl return _impl(rotation_config, group_size) diff --git a/setup.py b/setup.py index 16f7aaa17..6bb184bcb 100644 --- a/setup.py +++ b/setup.py @@ -130,7 +130,6 @@ def fetch_requirements(path): # python setup.py hpu install ############################################################################### - HPU_REQUIREMENTS_FILE = "requirements-hpu.txt" HPU_INSTALL_CFG = { "include_packages": find_packages( @@ -144,7 +143,6 @@ def fetch_requirements(path): "install_requires": fetch_requirements(HPU_REQUIREMENTS_FILE), } - # Support legacy `python setup.py hpu install` invocation for backward 
compatibility. # For python -m build / uv build, use the BUILD_HPU_ONLY=1 environment variable instead. if __name__ == "__main__": From 4c78670dd54b243dc85b0467ab4c93d580aa4145 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 13:33:42 +0800 Subject: [PATCH 83/90] clean Signed-off-by: n1ck-guo --- docs/fp8_new_arch_debug_handoff.md | 222 ----------------------------- 1 file changed, 222 deletions(-) delete mode 100644 docs/fp8_new_arch_debug_handoff.md diff --git a/docs/fp8_new_arch_debug_handoff.md b/docs/fp8_new_arch_debug_handoff.md deleted file mode 100644 index a269b66f9..000000000 --- a/docs/fp8_new_arch_debug_handoff.md +++ /dev/null @@ -1,222 +0,0 @@ -# FP8 Scheme Debug Handoff — `hengguo/new_ar_arch` (PR #1542) - -> **Purpose**: Hand-off for the next AI/engineer continuing FP8 regression -> debugging on the new AutoRound architecture (`auto_round/compressors_new/`). -> Skip the discovery phase — this doc captures what's broken, what's already -> fixed, how to reach the HPU test environment, and the exact commands that -> reproduce results. Read this end-to-end **before touching code**. - ---- - -## 1. Context - -- Repo: `intel/auto-round` -- Branch: `hengguo/new_ar_arch` (PR #1542 "Step1 new architecture for auto_round") -- Origin: `18ba254d merge main` (last pushed commit; several fixes are still **uncommitted** locally — see §5) -- Two parallel code paths exist: - - **Old arch**: `auto_round/compressors/` (baseline, `main` branch) - - **New arch**: `auto_round/compressors_new/` + `auto_round/algorithms/` + `auto_round/context/` (this PR) -- Routing entry: `auto_round/compressors_new/entry.py::AutoRound.__new__` picks old vs new compressor class by config type. - -### Key architectural differences vs old arch - -- `BaseCompressor.__getattr__` delegates attribute access to **three contexts** in order: `quantize_config`, `model_context`, `compress_context`. Missing attribute on all three → `AttributeError`. Many latent bugs were caused by attributes existing on the wrong context. -- Per-sample batching logic moved from `BaseCompressor._get_batch_data` (old) to `BaseQuantizers._sampling_inputs` in `auto_round/algorithms/quantization/base.py` (new). -- `share_cache_keys = ('position_ids', 'cache_position', 'position_embeddings')` — values cached **once** (not per-sample), typically wrapped by hook as `[val]`. New arch needs to unwrap and pass through regardless of `len(indices)`. -- Immediate packing flag is `self.compress_context.is_immediate_packing` (not `is_immediate_saving`). Conflating the two caused the original FP8_STATIC RAM regression. - ---- - -## 2. The FP8_STATIC Host-RAM Regression (primary motivating bug) - -### Symptom -On HPU, new-arch `FP8_STATIC` (static W8A8-FP8) tuning leaked ~GBs of host RAM per block vs old arch — traced to HPU eager-pipeline host-side growth when the static-activation calibration path runs. - -### Root causes & fixes (already in tree) - -1. **Immediate packing trigger flag** - `auto_round/compressors_new/calib.py` ~L1031: - ```python - if self.compress_context.is_immediate_packing: # was: is_immediate_saving - ... - ``` - Without this, packed weights were held in CPU RAM indefinitely. - -2. **`tmp_dtype` missing** - `auto_round/compressors_new/calib.py` ~L1424: added - ```python - tmp_dtype = self.model_context.amp_dtype if self.model_context.amp else torch.float32 - ``` - Matches old arch. - -### Status -**Fix #1 verified on HPU** (see §6). Do not revert these two pieces. 
- -> **Note (2026-04-22):** An earlier version of this branch added `_needs_hpu_fp8_static_eager_guard` / -> `_maybe_disable_hpu_eager_pipeline` in `entry.py` that set `PT_HPU_EAGER_PIPELINE_ENABLE=0` -> for FP8_STATIC on HPU. This was **speculative** (never confirmed to reduce RAM) and has been -> **deleted**. The `is_immediate_packing` fix in calib.py is the real fix. - ---- - -## 3. Latest performance report from user (the trigger for this handoff) - -Reported CI (from `performance_ut.sh`, model `Qwen/Qwen3-0.6B`, scheme likely W4A16 per default): - -``` -Tuning Time (s) : Current = 1192.5 | Baseline = 445.7 (+167.58%) FAIL -Peak RAM (GB) : Current = 3.68 | Baseline = 4.05 (−9.14%) FAIL (tolerance) -Peak VRAM (GB) : Current = 1.29 | Baseline = 26.73 (−95.17%) FAIL -Output Size (GB) : Current = 0.7114 | Baseline = 0.7114 (+0.00%) PASS -``` - -### CRITICAL finding from direct HPU test (iters=20, same scheme/model) - -Running the exact same binary on the HPU box (see §4) produced: - -``` -Quantizing model.layers.0 ... peak_ram=2.83GB peak_vram=1.25GB (block 0 only, expected) -Quantizing model.layers.1 ... peak_ram=2.86GB peak_vram=26.43GB ← HPU *is* being used -Quantizing model.layers.27 ... peak_ram=3.62GB peak_vram=26.61GB -quantization tuning time 68.34s for 20 iters → linear scale 200 iters ≈ 683s -real 1m24s user 11m16s -``` - -**So on the actual HPU**, `Peak VRAM = 26.61 GB` (matches baseline 26.73 GB). -The CI-reported `1.29 GB` equals the **first block only** (`model.layers.0`), before HPU allocation expands. This strongly suggests: - -- `check_performance.py` in the CI pipeline is picking up the first `peak_vram` log line (block 0 = 1.25–1.29 GB) and missing later ones, **OR** -- The CI run crashed/exited after block 0 (VRAM never grew) but reported partial data as "success". - -The tuning time gap (683 s estimate locally, 1192 s in CI) is real but smaller than the 2.7× the CI report suggests. Likely contributors: -- `torch.compile` recompile on every block (new-arch cache invalidation logic). -- Caching / `_sampling_inputs` overhead per step. -- CI docker env differences (no model warm cache, different HPU driver). - -### Action for next agent -1. **Do NOT assume VRAM=1.29 GB is real**. First re-read `.azure-pipelines/scripts/performance/check_performance.py` — it likely parses log incorrectly. Fix the parser before believing the VRAM number. -2. Investigate the tuning-time gap separately from VRAM: - - Profile `_resolve_block_forward`: does `torch.compile` actually hit the compiled path, or does `self` delegate through `__getattr__` to a context that returns `False` for `enable_torch_compile`? - - Profile `_sampling_inputs`: per-sample tensor copies, shared-key unwrap path. - - Compare `block_forward` call count / time per block between old and new arch under identical config. - ---- - -## 4. HPU Test Environment (**use this to reproduce — do not reinvent**) - -### 4.1 SSH chain (3 hops) - -``` -local → ssh tensorflow@clx5673.ra.intel.com - → ssh -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55 - → sshpass -p 1 ssh sdp@192.168.122.81 (host: kvm-01) - → docker exec AutoRoundDebug bash -``` - -- Final host `kvm-01` has **4× HL-225 (Gaudi2, 96 GB HBM each)**, driver 1.24.0, hl-1.23.0. -- Container `AutoRoundDebug`: Ubuntu 24.04, `torch 2.9.0+hpu.1.23.0.695`, `habana-torch-plugin 1.23.0.695`. -- Code lives at `/ar_work_space/auto-round-patched/` **inside** the container (NOT bind-mounted; must be copied via SSH). 
-- HF cache inside container: `~/.cache/huggingface/hub/` contains `models--Qwen--Qwen3-0.6B` and `Qwen3-1.7B` (already downloaded, no HF token needed). -- `auto_round` is **not** pip-installed in the container. Run via `PYTHONPATH=/ar_work_space/auto-round-patched` + `python3 -m auto_round ...`. - -### 4.2 Helper script that works across all 3 hops - -Saved at `/tmp/hpu_run.sh` (local). Base64-encodes the command to avoid shell escape hell: - -```bash -#!/bin/bash -# Usage: /tmp/hpu_run.sh '' -set -e -CMD="$1" -B64=$(printf '%s' "$CMD" | base64 -w0) -ssh -o StrictHostKeyChecking=no -T tensorflow@clx5673.ra.intel.com \ - "ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55 \ - \"sshpass -p 1 ssh -o StrictHostKeyChecking=no sdp@192.168.122.81 \\\"docker exec AutoRoundDebug bash -c 'echo $B64 | base64 -d | bash'\\\"\"" -``` - -### 4.3 Syncing local code → container (no bind mount) - -```bash -cd /home/hengguo/code/bug_fix/auto-round && \ -tar -czf - auto_round/ | ssh tensorflow@clx5673.ra.intel.com \ - "ssh -i ~/.ssh/id_rsa_qun -J guest@146.152.224.86 sdp@100.81.152.55 \ - 'sshpass -p 1 ssh -o StrictHostKeyChecking=no sdp@192.168.122.81 \ - \"docker exec -i AutoRoundDebug bash -c \\\"rm -rf /ar_work_space/auto-round-patched/auto_round && tar -C /ar_work_space/auto-round-patched -xzf - && echo OK\\\"\"'" -``` - -Repeat for `auto_round_extension/`, `setup.py`, `setup.cfg`, `pyproject.toml` if those change. - -### 4.4 Canonical perf commands - -**Short sanity (20 iters, ~90 s wall):** -```bash -/tmp/hpu_run.sh 'cd /ar_work_space/auto-round-patched && \ - export PYTHONPATH=/ar_work_space/auto-round-patched:$PYTHONPATH && \ - export HF_HUB_DISABLE_PROGRESS_BARS=1 TQDM_MININTERVAL=60 && \ - rm -rf /tmp/ar_test && \ - time python3 -m auto_round --model_name Qwen/Qwen3-0.6B --scheme W4A16 \ - --iters 20 --enable_torch_compile --device hpu --output_dir /tmp/ar_test 2>&1 | tail -60' -``` - -**Full perf run (mirrors `performance_ut.sh`, 200 iters, ~12 min):** -Replace `--iters 20` with `--iters 200` above. Baseline for W4A16/Qwen3-0.6B is ~445 s tuning on this box. - -**FP8_STATIC reproduction:** replace `--scheme W4A16` with `--scheme FP8_STATIC`. - ---- - -## 5. Uncommitted local changes (as of this handoff) - -``` -auto_round/algorithms/quantization/base.py — _sampling_inputs share_cache_keys unwrap -auto_round/utils/device.py — get_device_and_parallelism dict handling -auto_round/special_model_handler.py — L207 use pre-extracted model_type (gemma4 FrozenDict) -test/test_cpu/export/test_export.py — added autoround_old.post_init() in INT8_W8A8 test -``` - -Plus earlier, already-committed fixes: -- Deleted `auto_round/sign_sgd.py` (duplicate of `auto_round/algorithms/quantization/sign_round/sign_sgd.py`). -- Removed duplicate `from auto_round.sign_sgd import SignSGD` in `auto_round/compressors/base.py`. -- `calib.py`: `is_immediate_packing` flag + `tmp_dtype` definition. -- `entry.py`: Removed speculative `_maybe_disable_hpu_eager_pipeline` / `_needs_hpu_fp8_static_eager_guard` (never validated on HPU). -- `utils/device.py`: `get_device_and_parallelism` now handles `device=None` (fixes llmcompressor integration crash). - -Push these before next CI run or CI logs will still show pre-fix behaviour. - ---- - -## 6. What the next agent should do (ordered) - -1. **Commit & push the uncommitted fixes in §5** so CI reflects current state. -2. 
**Fix `check_performance.py`** (in `.azure-pipelines/scripts/performance/`) — it is almost certainly reporting `peak_vram` from block 0 instead of the run max. Local HPU proof in §3 shows VRAM=26.6 GB is correct. -3. **Profile the real 2.7× tuning-time gap** with iters=200: - - Add timing around `_resolve_block_forward` branches (compiled vs plain). - - Log `self.compress_context.enable_torch_compile` once per block. - - Compare `_sampling_inputs` CPU time between archs (new arch has extra conditional branches for share_cache_keys). - - Check whether `torch.compile` cache is invalidated every block (`_invalidate_block_forward_cache` in `calib.py` at block boundary). Old arch reused the compiled function across blocks; new arch resets on `_dynamo.reset()` — confirm this is intentional and not the regression source. -4. Only then consider the algorithm-level code as suspect. - ---- - -## 7. Known-good signals (sanity checks) - -- `PT_HPU_LAZY_MODE=0` (eager mode) is active in this container. -- `torch.hpu.is_available() → True`, `device_count() → 4`. -- `from auto_round import __version__ → 0.13.0`. - ---- - -## 8. Files to study first (highest-signal) - -| Path | Why | -|---|---| -| `auto_round/compressors_new/entry.py` | routing, scheme pre-resolution | -| `auto_round/compressors_new/calib.py` | caching, block loop, immediate_pack, tmp_dtype | -| `auto_round/algorithms/quantization/base.py` | `_sampling_inputs`, `_get_block_outputs`, `_resolve_block_forward` | -| `auto_round/context/compress.py` / `model.py` | the three contexts `__getattr__` delegates to | -| `auto_round/compressors/base.py` (old arch) | ground-truth reference for every behaviour | -| `.azure-pipelines/scripts/performance/check_performance.py` | likely source of bogus VRAM=1.29 GB | - ---- - -*Written 2026-04-22 during PR #1542 post-merge bug-fix session.* From e3087956fe44b9571b5847232213093baf5eb107 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 15:08:49 +0800 Subject: [PATCH 84/90] fix(transforms): handle block_size=None in HadamardTransform RotationConfig.block_size defaults to None (meaning 'unset / auto'). When build_hadamard_transform passes block_size=None explicitly, it overrides the __init__ default of 32, causing math.sqrt(None) to raise TypeError. Fix: fall back to 32 when block_size is None. 
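For reference, a minimal sketch of the failure mode and the fallback (illustrative only; the helper name below is hypothetical, the real change is in HadamardTransform.__init__):

```python
import math

# Pre-fix behaviour: an explicitly passed block_size=None overrides the
# __init__ default of 32, so math.sqrt(None) raises TypeError.
def make_scale_broken(block_size: int | None = 32) -> float:
    return 1.0 / math.sqrt(block_size)  # TypeError when block_size is None

# Post-fix behaviour: treat None as "unset / auto" and fall back to 32.
def make_scale_fixed(block_size: int | None = 32) -> float:
    size = block_size if block_size is not None else 32
    return 1.0 / math.sqrt(size)

assert make_scale_fixed(None) == make_scale_fixed(32)  # None now means "auto" -> 32
```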
--- auto_round/algorithms/transforms/rotation/transforms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/algorithms/transforms/rotation/transforms.py b/auto_round/algorithms/transforms/rotation/transforms.py index 9725b2d9a..78dcd0d6b 100644 --- a/auto_round/algorithms/transforms/rotation/transforms.py +++ b/auto_round/algorithms/transforms/rotation/transforms.py @@ -68,7 +68,7 @@ class HadamardTransform(nn.Module): def __init__( self, - block_size: int = 32, + block_size: int | None = 32, device: torch.device | None = None, precision: torch.dtype | None = None, location: str = "weight", @@ -76,7 +76,7 @@ def __init__( inverse: bool = False, ) -> None: super().__init__() - self.size = block_size + self.size = block_size if block_size is not None else 32 self.scale = 1.0 / math.sqrt(self.size) self.location = location self.module_type = module_type From 110e9eadbc538ee02c1b335f2609611f3fded15e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 23 Apr 2026 15:34:28 +0800 Subject: [PATCH 85/90] fix cuda ut Signed-off-by: n1ck-guo --- test/test_cuda/export/test_gguf_format.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_cuda/export/test_gguf_format.py b/test/test_cuda/export/test_gguf_format.py index 8d4c6a363..f67e003e4 100644 --- a/test/test_cuda/export/test_gguf_format.py +++ b/test/test_cuda/export/test_gguf_format.py @@ -213,8 +213,7 @@ def test_q2_k_s_ffn_down_q4k(self): from gguf.gguf_reader import GGUFReader model_path = get_model_path("Qwen/Qwen3-1.7B") - tiny_model_path = "./tmp/tiny_qwen3_1b" - save_tiny_model(model_path, tiny_model_path, num_layers=8) + tiny_model_path = save_tiny_model(model_path, "./tmp/tiny_qwen3_1b", num_layers=8) autoround = AutoRound( tiny_model_path, iters=0, From 441d060fba41d57a2e674ca3feadef249e729559 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Apr 2026 09:05:01 +0800 Subject: [PATCH 86/90] sync: add xpu sdpa patch and AutoScheme VLM support to new arch --- auto_round/compressors_new/base.py | 86 +++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors_new/base.py b/auto_round/compressors_new/base.py index 8d9842503..37f5726e8 100644 --- a/auto_round/compressors_new/base.py +++ b/auto_round/compressors_new/base.py @@ -57,7 +57,12 @@ is_quantized_input_module, memory_monitor, ) -from auto_round.utils.device import _force_trim_malloc, get_major_device, set_non_auto_device_map +from auto_round.utils.device import ( + _force_trim_malloc, + get_major_device, + patch_xpu_sdpa_drop_causal_mask, + set_non_auto_device_map, +) from auto_round.utils.offload import OffloadManager from auto_round.wrapper import wrapper_block @@ -203,6 +208,11 @@ def __init__( else: torch.use_deterministic_algorithms(True, warn_only=True) + # XPU SDPA workaround: drop pure causal masks so FLASH backend is used, + # and set torch.use_deterministic_algorithms(False) + # instead of MATH (avoids ~10x peak-VRAM blow-up during block tuning). 
+ patch_xpu_sdpa_drop_causal_mask() + device = kwargs.pop("device", None) if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead") @@ -349,7 +359,15 @@ def _scheme_post_init(self) -> None: def _gen_auto_scheme(self) -> dict[str, dict]: """Generate per-layer config via AutoScheme delta-loss selection.""" if self.model_context.is_mllm: - raise NotImplementedError("AutoScheme is not yet supported for multimodal LLMs.") + # AutoScheme on a VLM only scores the language tower (the block + # walker in delta_loss already skips vision/audio sub-trees) and + # uses a pure-text calibration dataset by default, falling back to + # the multimodal dataloader if the VLM rejects text-only forward. + logger.info( + "AutoScheme on multimodal LLM: scoring the language tower only " + "with text-only calibration (multimodal dataloader will be used " + "as a fallback if needed)." + ) if is_quantized_input_module(self.model_context.model): raise NotImplementedError("AutoScheme does not currently support quantized input models (e.g., FP8).") @@ -394,6 +412,58 @@ def _gen_auto_scheme(self) -> dict[str, dict]: is_mllm=self.model_context.is_mllm, ) quant_layer_names = layer_config.keys() + + # ---- VLM: peel non-text sub-trees AutoScheme should not score ---- # + nontext_skipped_layers: dict[str, dict] = {} + if self.model_context.is_mllm: + from auto_round.utils import get_block_names + + quant_nontext = getattr(self, "quant_nontext_module", False) + scoreable_blocks = get_block_names(self.model_context.model, quant_vision=quant_nontext) + scoreable_block_prefixes = tuple(blk for group in scoreable_blocks for blk in group) + if quant_nontext: + peel_markers = ("audio", "speech", "wav", "waveform") + tower_label = "language+vision" + peel_label = "audio/speech" + else: + peel_markers = ( + "vision", + "visual", + "image", + "img", + "audio", + "speech", + "wav", + "waveform", + ) + tower_label = "language" + peel_label = "vision/audio" + + def _is_scoreable_layer(name: str) -> bool: + if any(name == p or name.startswith(p + ".") for p in scoreable_block_prefixes): + return True + lname = name.lower() + return not any(marker in lname for marker in peel_markers) + + scoreable_layer_config = {} + for name, cfg in layer_config.items(): + if _is_scoreable_layer(name): + scoreable_layer_config[name] = cfg + else: + nontext_skipped_layers[name] = cfg + + if nontext_skipped_layers: + logger.info( + "AutoScheme (VLM): scoring %d %s-tower layers; " + "%d %s-tower layers kept at their original 16-bit configuration.", + len(scoreable_layer_config), + tower_label, + len(nontext_skipped_layers), + peel_label, + ) + layer_config = scoreable_layer_config + quant_layer_names = layer_config.keys() + scheme_keys = {f.name for f in fields(QuantizationScheme)} fixed_layer_scheme_new = { k: {key: v[key] for key in scheme_keys & v.keys()} @@ -418,8 +488,20 @@ def _gen_auto_scheme(self) -> dict[str, dict]: device_map=self.compress_context.device_map, tokenizer=self.model_context.tokenizer, enable_torch_compile=self.compress_context.enable_torch_compile, + processor=self.model_context.processor, ) layer_config = self.scheme_generator.get_layer_config() + # Re-attach vision/audio-tower layers we peeled off earlier so the + # downstream quantization pipeline sees the complete layer map. 
+ if nontext_skipped_layers: + allowed_keys = {f.name for f in fields(QuantizationScheme)} | { + "fixed_by_user", + "scale_dtype", + "scheme", + } + for name, cfg in nontext_skipped_layers.items(): + clean_cfg = {k: v for k, v in cfg.items() if k in allowed_keys} if isinstance(cfg, dict) else cfg + layer_config.setdefault(name, clean_cfg) return layer_config def configure_layer_config(self, enable_gguf_official_mixed: bool | None = True) -> None: From efe2c74bd1e93c2b579eeb19ebeff10176e44e87 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Apr 2026 14:11:27 +0800 Subject: [PATCH 87/90] merge main Signed-off-by: n1ck-guo --- auto_round/algorithms/transforms/rotation/patch.py | 4 ++-- auto_round/compressors_new/diffusion_mixin.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/algorithms/transforms/rotation/patch.py b/auto_round/algorithms/transforms/rotation/patch.py index 49c838ac6..632ec9682 100644 --- a/auto_round/algorithms/transforms/rotation/patch.py +++ b/auto_round/algorithms/transforms/rotation/patch.py @@ -81,10 +81,10 @@ def _qdq_weight_patched(self, value, min_scale, max_scale): _orig_qdq_act = WrapperLinear._qdq_act - def _qdq_act_patched(self, x, act_max_scale, act_max=None): + def _qdq_act_patched(self, x, act_min_scale=torch.tensor(1.0), act_max_scale=torch.tensor(1.0), act_max=None): x = inp_transform(x) - return _orig_qdq_act(self, x, act_max_scale, act_max) + return _orig_qdq_act(self, x, act_min_scale=act_min_scale, act_max_scale=act_max_scale, act_max=act_max) WrapperLinear._qdq_weight = _qdq_weight_patched WrapperLinear._qdq_act = _qdq_act_patched diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 5225e1e33..1139e384b 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -112,7 +112,7 @@ def calib(self, nsamples, bs): if ( hasattr(self.model, "hf_device_map") - and len(self.model.hf_device_map) > 0 + and len(self.model.hf_device_map) > 1 and pipe.device != self.model.device and torch.device(self.model.device).type in ["cuda", "xpu"] ): From 26aa1066c5bdb0471ed25a209be9f41f1e767102 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Apr 2026 16:25:36 +0800 Subject: [PATCH 88/90] fix diffusion ut Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 4 ++-- auto_round/compressors_new/diffusion_mixin.py | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 054ab9acd..8c400aec7 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -413,7 +413,7 @@ def _get_current_q_output( _bf = self._resolve_block_forward() if getattr(self.model_context, "is_diffusion", False): - output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, ["hidden_states"]) idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") if isinstance(current_input_ids, dict): hidden_states = current_input_ids.pop("hidden_states") @@ -510,7 +510,7 @@ def _get_diffusion_block_outputs( support for new diffusion architectures. 
""" output = defaultdict(list) - output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, []) + output_config = self.DIFFUSION_OUTPUT_CONFIGS.get(block.__class__.__name__, ["hidden_states"]) if isinstance(input_ids, dict): nsamples = len(input_ids["hidden_states"]) else: diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 1139e384b..20b6c1baf 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import inspect import os from typing import Union @@ -51,10 +52,29 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps self.generator_seed = generator_seed + self.height = kwargs.pop("height", None) + self.width = kwargs.pop("width", None) # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) + def _get_pipeline_call_kwargs(self, pipe) -> dict: + """Build optional pipeline kwargs for calibration. + + Prefer latent outputs during calibration when the pipeline supports them, + since transformer-block caching does not require VAE decode and this + avoids dtype mismatches in tiny/random diffusion fixtures. + """ + pipe_sig = inspect.signature(pipe.__call__) + extra = {} + if "height" in pipe_sig.parameters and self.height is not None: + extra["height"] = self.height + if "width" in pipe_sig.parameters and self.width is not None: + extra["width"] = self.width + if "output_type" in pipe_sig.parameters: + extra["output_type"] = "latent" + return extra + def _get_block_forward_func(self, name: str): """Diffusion models pass positional args; wrap the base forward func accordingly. @@ -107,6 +127,7 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) + extra = self._get_pipeline_call_kwargs(pipe) if pipe.dtype != self.model.dtype: pipe.to(self.model.dtype) @@ -139,6 +160,7 @@ def calib(self, nsamples, bs): if self.generator_seed is None else torch.Generator(device=pipe.device).manual_seed(self.generator_seed) ), + **extra, ) except NotImplementedError: pass From 9d98b0dab4d98db7cf8b1c1fce4b548f19c7b214 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Apr 2026 17:02:42 +0800 Subject: [PATCH 89/90] diffusion: align new arch with old DiffusionCompressor - DiffusionMixin.__init__: when iters>0, force batch_size=1 and fold the batch into gradient_accumulate_steps, patching both kwargs and the AlgConfig (same pattern as MLLMMixin), matching old DiffusionCompressor.__init__. - DiffusionMixin.__init__ (post-super): unconditionally pipe.to(model.dtype) to align VAE/text-encoder with transformer dtype, mirroring DiffusionCompressor._align_device_and_dtype. Equality check on pipe.dtype is unreliable because it only reflects the primary component. - BaseQuantizers.DIFFUSION_OUTPUT_CONFIGS: populate Flux/OvisImage block output mappings, matching old-arch compressors/diffusion/compressor.py. - Revert new-arch-only logic (_get_pipeline_call_kwargs, height/width, output_type=latent injection) to keep parity with old arch. 
--- auto_round/algorithms/quantization/base.py | 7 +- auto_round/compressors_new/diffusion_mixin.py | 85 +++++++++++++------ 2 files changed, 67 insertions(+), 25 deletions(-) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index 8c400aec7..a92f75e7b 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -48,7 +48,12 @@ class BaseQuantizers: # Subclasses that support diffusion models should override this with the # appropriate output key mapping, e.g.: # DIFFUSION_OUTPUT_CONFIGS = {"FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"]} - DIFFUSION_OUTPUT_CONFIGS: dict = {} + DIFFUSION_OUTPUT_CONFIGS: dict = { + "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "OvisImageTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "OvisImageSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], + } def __init__(self, config: QuantizationConfig): self.config = config diff --git a/auto_round/compressors_new/diffusion_mixin.py b/auto_round/compressors_new/diffusion_mixin.py index 20b6c1baf..515839c1f 100644 --- a/auto_round/compressors_new/diffusion_mixin.py +++ b/auto_round/compressors_new/diffusion_mixin.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import inspect import os from typing import Union @@ -52,28 +51,70 @@ def __init__(self, *args, guidance_scale=7.5, num_inference_steps=50, generator_ self.guidance_scale = guidance_scale self.num_inference_steps = num_inference_steps self.generator_seed = generator_seed - self.height = kwargs.pop("height", None) - self.width = kwargs.pop("width", None) + + # Mirror old-arch DiffusionCompressor.__init__: when iters > 0, diffusion calibration + # cannot use batch_size > 1 for non-text modules; fold the extra batch into + # gradient_accumulate_steps so the effective sample count is unchanged. + # The authoritative batch_size lives on the AlgConfig (args[0]); kwargs may also + # carry it from AutoRoundCompatible. Patch BOTH (same pattern as MLLMMixin). 
+ iters = kwargs.get("iters", None) + _alg_cfg = args[0] if args else None + if iters is None and _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "iters") and cfg.iters is not None: + iters = cfg.iters + break + if iters is None: + iters = 200 + + if iters > 0: + batch_size = kwargs.get("batch_size", None) + if batch_size is None and _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "batch_size") and cfg.batch_size is not None: + batch_size = cfg.batch_size + break + if batch_size is not None and batch_size != 1: + grad_acc = kwargs.get("gradient_accumulate_steps", 1) + if _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "gradient_accumulate_steps") and cfg.gradient_accumulate_steps is not None: + grad_acc = cfg.gradient_accumulate_steps + break + new_grad_acc = batch_size * grad_acc + kwargs["gradient_accumulate_steps"] = new_grad_acc + kwargs["batch_size"] = 1 + if _alg_cfg is not None: + cfgs = _alg_cfg if isinstance(_alg_cfg, list) else [_alg_cfg] + for cfg in cfgs: + if hasattr(cfg, "batch_size"): + cfg.batch_size = 1 + if hasattr(cfg, "gradient_accumulate_steps"): + cfg.gradient_accumulate_steps = new_grad_acc + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps to {new_grad_acc} " + f"because batch_size={batch_size} cannot be used for calibrating non-text modules." + ) # Call parent class __init__ (will be CalibCompressor, ImatrixCompressor, etc) super().__init__(*args, **kwargs) - def _get_pipeline_call_kwargs(self, pipe) -> dict: - """Build optional pipeline kwargs for calibration. - - Prefer latent outputs during calibration when the pipeline supports them, - since transformer-block caching does not require VAE decode and this - avoids dtype mismatches in tiny/random diffusion fixtures. - """ - pipe_sig = inspect.signature(pipe.__call__) - extra = {} - if "height" in pipe_sig.parameters and self.height is not None: - extra["height"] = self.height - if "width" in pipe_sig.parameters and self.width is not None: - extra["width"] = self.width - if "output_type" in pipe_sig.parameters: - extra["output_type"] = "latent" - return extra + # Mirror old-arch DiffusionCompressor._align_device_and_dtype: unconditionally + # cast the full diffusion pipeline (VAE, text encoder, etc.) to the transformer's + # dtype so that calibration's pipe(...) call doesn't crash with dtype mismatches + # when the transformer is force-cast to bf16 for activation quantization. + # Note: pipe.dtype only reflects the primary component, so an equality check would + # miss mixed-dtype pipelines where e.g. the VAE is still float32. + pipe = getattr(self.model_context, "pipe", None) + model = getattr(self.model_context, "model", None) + if pipe is not None and model is not None: + is_nextstep = hasattr(model, "config") and getattr(model.config, "model_type", None) == "nextstep" + if not is_nextstep: + pipe.to(model.dtype) def _get_block_forward_func(self, name: str): """Diffusion models pass positional args; wrap the base forward func accordingly. 
@@ -127,9 +168,6 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) - extra = self._get_pipeline_call_kwargs(pipe) - if pipe.dtype != self.model.dtype: - pipe.to(self.model.dtype) if ( hasattr(self.model, "hf_device_map") @@ -152,7 +190,7 @@ def calib(self, nsamples, bs): prompts = list(prompts) try: pipe( - prompt=prompts, + prompts, guidance_scale=self.guidance_scale, num_inference_steps=self.num_inference_steps, generator=( @@ -160,7 +198,6 @@ def calib(self, nsamples, bs): if self.generator_seed is None else torch.Generator(device=pipe.device).manual_seed(self.generator_seed) ), - **extra, ) except NotImplementedError: pass From 2fe5f032fdd940dc4cae822becc92856c20987ff Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Apr 2026 22:49:22 +0800 Subject: [PATCH 90/90] fix Signed-off-by: n1ck-guo --- auto_round/algorithms/quantization/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/algorithms/quantization/base.py b/auto_round/algorithms/quantization/base.py index a92f75e7b..12d972554 100644 --- a/auto_round/algorithms/quantization/base.py +++ b/auto_round/algorithms/quantization/base.py @@ -546,6 +546,8 @@ def _get_diffusion_block_outputs( device, None, ) + if isinstance(tmp_output, torch.Tensor): + tmp_output = [tmp_output] assert len(output_config) == len(tmp_output) tmp_output = dict(zip(output_config, tmp_output))