From 2401210fe7267721c3abfcb88d211ff1778b365f Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 4 Mar 2026 11:28:09 -0800 Subject: [PATCH 01/28] Bump transformers to 5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/gpt-oss/requirements.txt | 1 - examples/llm_distill/requirements.txt | 1 - examples/speculative_decoding/requirements.txt | 3 +-- examples/vlm_ptq/requirements-vila.txt | 3 --- modelopt/torch/__init__.py | 2 +- modelopt/torch/speculative/plugins/transformers.py | 9 ++------- pyproject.toml | 2 +- 7 files changed, 5 insertions(+), 16 deletions(-) delete mode 100644 examples/vlm_ptq/requirements-vila.txt diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 368097d337..76c3b0a2e8 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,5 +1,4 @@ kernels>=0.9.0 torch>2.7.1 trackio -transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/llm_distill/requirements.txt b/examples/llm_distill/requirements.txt index 91dda9dafd..4bcd190839 100644 --- a/examples/llm_distill/requirements.txt +++ b/examples/llm_distill/requirements.txt @@ -1,4 +1,3 @@ pyarrow torchao>=0.14.1 -transformers<5.0 trl>=0.23.0 diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt index 6324bac62b..8e50f9c3f4 100644 --- a/examples/speculative_decoding/requirements.txt +++ b/examples/speculative_decoding/requirements.txt @@ -1,2 +1 @@ -accelerate==1.12.0 -transformers==5.0.0rc1 +transformers>=5.0 diff --git a/examples/vlm_ptq/requirements-vila.txt b/examples/vlm_ptq/requirements-vila.txt deleted file mode 100644 index 7391a5f268..0000000000 --- a/examples/vlm_ptq/requirements-vila.txt +++ /dev/null @@ -1,3 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git -transformers<=4.50.0 diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index ec62b86ffc..d2d4d80582 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -32,7 +32,7 @@ try: from transformers import __version__ as _transformers_version - if not (_Version("4.56") <= _Version(_transformers_version) < _Version("5.0")): + if not (_Version("4.56") <= _Version(_transformers_version)): _warnings.warn( f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. " "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.", diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index 8561a390fc..286f256b87 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -75,11 +75,6 @@ CACHED_SHARD_TTT_MASKS = {} -def _get_empty_cache(config): - """Return an empty cache. 
Handle different versions of transformers for unit tests.""" - return DynamicCache(config=config) - - @MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"}) class HFMedusaModel(MedusaModel): """Medusa Model Class for huggingface models.""" @@ -910,9 +905,9 @@ def forward( ) if not isinstance(past_key_values, Cache): - past_key_values = _get_empty_cache(self._base_llm_config) + past_key_values = DynamicCache(config=self._base_llm_config) if not isinstance(eagle_cache, Cache): - eagle_cache = _get_empty_cache(self.eagle_module.config) + eagle_cache = DynamicCache(config=self.eagle_module.config) past_key_values.eagle_cache = eagle_cache # ====Prepare inputs for the first eagle forward pass==== diff --git a/pyproject.toml b/pyproject.toml index 96490dff0a..cb4185e16b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=4.56", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ From b269824776b8b5240b78f3212d7e3cb37756aab3 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 6 Mar 2026 08:15:33 -0800 Subject: [PATCH 02/28] Fix Bert Gradnas tracing for transformers 5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/trace/plugins/transformers.py | 58 ++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py index f07a37601b..02e70741c5 100644 --- a/modelopt/torch/trace/plugins/transformers.py +++ b/modelopt/torch/trace/plugins/transformers.py @@ -15,7 +15,10 @@ """Utilities to describe symbols in the dynamic attention module.""" +import torch +from packaging.version import Version as _Version from torch import nn +from transformers import __version__ as _transformers_version from transformers.models.bert.modeling_bert import BertAttention from transformers.models.gptj.modeling_gptj import GPTJAttention @@ -56,3 +59,58 @@ def get_hf_attn_sym_info_sortable(mod: nn.Module) -> SymInfo: @SymMap.register([GPTJAttention]) def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo: return get_hf_attn_sym_info(sortable_attn=True) + + +# In transformers>=5.0, BertLayer.forward uses tuple unpacking on the BertAttention output +# (e.g. `self_attn_out, _ = self.attention(...)`), which FX symbolic tracing cannot handle when +# BertAttention is a registered leaf (the proxy is not iterable). Patch BertLayer.forward to use +# indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward +# with chunk_size=0, which is the default for BERT). +if _Version(_transformers_version) >= _Version("5.0"): + from transformers.models.bert.modeling_bert import BertLayer as _BertLayer + + def _fx_friendly_bert_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + cache_position=None, + **kwargs, + ): + # Use indexing instead of tuple-unpacking so FX can trace through BertLayer + # when BertAttention is a registered leaf (returns an opaque Proxy). + # Accept **kwargs so that a parent trace (e.g. 
BertEncoder) passing extra kwargs + # like position_ids does not mark BertLayer as failed. However, do NOT forward + # **kwargs into self.attention: FX represents **kwargs as a Proxy(_kwargs), so + # unpacking it with ** would trigger "Proxy cannot be iterated". Additionally, + # BertSelfAttention ignores these kwargs (e.g. position_ids) in practice. + _attn_outputs = self.attention( + hidden_states, + attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = _attn_outputs[0] + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with" + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + _cross_outputs = self.crossattention( + attention_output, + None, + encoder_hidden_states, + encoder_attention_mask, + past_key_values=past_key_values, + ) + attention_output = _cross_outputs[0] + + # Call feed_forward_chunk directly (equivalent to apply_chunking_to_forward when + # chunk_size_feed_forward=0, which is the BERT default). + return self.feed_forward_chunk(attention_output) + + _BertLayer.forward = _fx_friendly_bert_layer_forward From 66ec5533ef6e5bb9b718cb025b69b3315826f41f Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:32:39 -0700 Subject: [PATCH 03/28] Add more fixes Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/opt/plugins/transformers.py | 4 +-- .../torch/quantization/plugins/huggingface.py | 33 +++++++++++-------- .../torch/quantization/utils/core_utils.py | 7 ++++ 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 7cfdc8ca0c..220f4cae9e 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -93,12 +93,12 @@ def _save_pretrained_with_checks(self, save_directory, *args, **kwargs): # [Fix for huggingface bug] deepspeed zero3 training backend only loads params into the model from # state_dict, but not buffers. So lets explicitly load the buffers into the model from state_dict. 
-def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict): +def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict, load_config=None): buffer_names = [name for name, _ in model_to_load.named_buffers()] buffer_state_dict = {k: v for k, v in state_dict.items() if k in buffer_names} model_to_load.load_state_dict(buffer_state_dict, strict=False) return tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"]( - model_to_load, state_dict + model_to_load, state_dict, load_config ) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 0d02716a6e..1484a404f9 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -333,10 +333,14 @@ class HFParallelLinear(torch.nn.Linear, DynamicModule): shard = None def _setup(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - tp_group = self.weight.device_mesh.get_group() + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + device_mesh = self.weight.device_mesh + else: # transformers>=5.0: weights are plain Parameters, mesh is on the module + device_mesh = self._hf_device_mesh + tp_group = device_mesh.get_group() self._parallel_state = ParallelState(data_parallel_group=-1, tensor_parallel_group=tp_group) @classmethod @@ -371,14 +375,17 @@ def fold_weight(self, keep_attrs: bool = False): @contextmanager def enable_weight_access_and_writeback(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - weight = self.weight - # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard - self.weight = nn.Parameter(weight.to_local()) - yield - self.weight = weight + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + weight = self.weight + # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard + self.weight = nn.Parameter(weight.to_local()) + yield + self.weight = weight + else: # transformers>=5.0: weights are already plain Parameters + yield @QuantModuleRegistry.register({HFColumnParallelLinear: "HFColumnParallelLinear"}) @@ -523,7 +530,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: super().forward(hidden_states) self.gate.top_k = original_top_k else: - # Path for transformers < 5.0 + # Path for transformers<5.0 if hasattr(self, "gate") and hasattr(self.gate, "top_k"): top_k_owner = self.gate else: diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 4340b8dc1f..22f2079649 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -524,7 +524,14 @@ def sync_moe_expert_amax(experts): 2. For any ``weight_quantizer`` that is enabled but has ``amax is None`` (expert received no tokens during calibration), runs a weight-only ``max_calibrate`` to populate the missing amax. + + No-op for batched expert modules (e.g. 
transformers>=5.0 ``Qwen3MoeExperts``) + that store all expert weights in a single 3D tensor without per-expert sub-modules. """ + if not hasattr(experts, "__iter__"): + # transformers>=5.0: batched experts, no per-expert quantizers + return + from ..nn import TensorQuantizer amax_dict: dict[str, torch.Tensor] = {} From bdaa515ef64b5f112f7f94c47df118290396cfe1 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:42:04 -0700 Subject: [PATCH 04/28] Fix Bert and DBRX unit tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .../quantization/nn/modules/quant_linear.py | 49 +++++++++----- .../torch/quantization/plugins/huggingface.py | 66 ++++++++++++------- .../quantization/plugins/test_huggingface.py | 10 ++- 3 files changed, 79 insertions(+), 46 deletions(-) diff --git a/modelopt/torch/quantization/nn/modules/quant_linear.py b/modelopt/torch/quantization/nn/modules/quant_linear.py index bcb71e4c93..bb65d59077 100644 --- a/modelopt/torch/quantization/nn/modules/quant_linear.py +++ b/modelopt/torch/quantization/nn/modules/quant_linear.py @@ -246,26 +246,39 @@ def __init__(self, weight_quantizer: TensorQuantizer, *args, **kwargs): self.weight_quantizer = weight_quantizer def __setitem__(self, key, value): - if ( - key == "weight" - and self.weight_quantizer - and self.weight_quantizer.is_enabled - and not self.weight_quantizer._fake_quant - and value.element_size() > 1 - ): - # reset the amax for later calibration + if key == "weight" and not isinstance(value, QTensorWrapper): + existing = self.get("weight") if ( - self.weight_quantizer.amax is not None - and self.weight_quantizer.amax.is_meta + isinstance(existing, QTensorWrapper) + and not existing.is_meta + and existing.shape == value.shape ): - delattr(self.weight_quantizer, "_amax") - self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) - self.weight_quantizer._calibrator.reset() - # compress the weight - real_quant_tensor = self.weight_quantizer(value) - real_quant_value = QTensorWrapper(real_quant_tensor) - del value # delete the original weight to save memory - value = real_quant_value + # Loading a compressed weight (e.g. from safetensors in transformers>=5.0 + # which replaces parameters via setattr rather than copy_). Preserve the + # QTensorWrapper type and metadata. 
+ super().__setitem__( + key, QTensorWrapper(qtensor=value.data, metadata=existing.metadata) + ) + return + if ( + self.weight_quantizer + and self.weight_quantizer.is_enabled + and not self.weight_quantizer._fake_quant + and value.element_size() > 1 + ): + # reset the amax for later calibration + if ( + self.weight_quantizer.amax is not None + and self.weight_quantizer.amax.is_meta + ): + delattr(self.weight_quantizer, "_amax") + self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) + self.weight_quantizer._calibrator.reset() + # compress the weight + real_quant_tensor = self.weight_quantizer(value) + real_quant_value = QTensorWrapper(real_quant_tensor) + del value # delete the original weight to save memory + value = real_quant_value super().__setitem__(key, value) # Monkey patch the _parameters.__setitem__ to real quant the weight when loading diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 1484a404f9..45bd50631d 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -172,14 +172,20 @@ def forward(self, *args, **kwargs): The forward method is used to patch the attention interface with _quantized_attention. Once output tensors are generated, it restores the original attention interface. """ + # In transformers>=5.0 some attention classes (e.g. BertAttention) no longer store + # `self.config` directly; fall back to searching child modules for a config attribute. + _config = getattr(self, "config", None) + if _config is None: + _config = next( + (getattr(m, "config", None) for m in self.children() if hasattr(m, "config")), + None, + ) + _attn_impl = getattr(_config, "_attn_implementation", None) if _config is not None else None def _is_eager_attention(): - if self.config._attn_implementation == "eager": + if _attn_impl is None or _attn_impl == "eager": return True - return bool( - self.config._attn_implementation == "sdpa" - and kwargs.get("output_attentions", False) - ) + return bool(_attn_impl == "sdpa" and kwargs.get("output_attentions", False)) # Get the original transformers module before wrapped in any ModelOpt DynamicModule module: ModuleType = inspect.getmodule(self.get_attn_type(self)) @@ -188,7 +194,7 @@ def _is_eager_attention(): original_attention_interface = ( module.eager_attention_forward if _is_eager_attention() - else module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + else module.ALL_ATTENTION_FUNCTIONS[_attn_impl] ) patch_fn = partial(self._quantized_attention, original_attention_interface) @@ -201,7 +207,7 @@ def _is_eager_attention(): ) module.eager_attention_forward = patch_fn # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = patch_fn + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = patch_fn try: outputs = super().forward(*args, **kwargs) @@ -210,9 +216,7 @@ def _is_eager_attention(): if _is_eager_attention(): module.eager_attention_forward = original_attention_interface # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = ( - original_attention_interface - ) + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = original_attention_interface return outputs @@ -603,22 +607,20 @@ def _setup(self): """Modify the DbrxExpert.""" # No setup is needed for DbrxExpert, we only need to update DbrxExpertGLU - # forward method copied from the original dbrx repo - 
https://github.com/databricks/dbrx/blob/a3200393/model/modeling_dbrx.py#L795 def forward( self, x: torch.Tensor, - weights: torch.Tensor, - top_weights: torch.Tensor, top_experts: torch.LongTensor, + top_weights: torch.Tensor, ) -> torch.Tensor: bsz, q_len, hidden_size = x.shape x = x.view(-1, hidden_size) out = torch.zeros_like(x) - expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute( + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.num_experts).permute( 2, 1, 0 ) - for expert_idx in range(self.moe_num_experts): + for expert_idx in range(self.num_experts): topk_idx, token_idx = torch.where(expert_mask[expert_idx]) if token_idx.shape[0] == 0: continue @@ -648,41 +650,48 @@ def _copy_weights(modules, weights): with torch.no_grad(): module.weight.copy_(weights[expert_idx].detach()) + # In transformers 5.0, DbrxExpertGLU.forward uses raw matmul: x @ w1[i] where + # w1[i] has shape (ffn_hidden_size, hidden_size). To match via F.linear (which + # computes x @ W.T), we store weights transposed: W = w1[i].T. self.w1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w1_linear, - self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "w1") self.v1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.v1_linear, - self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "v1") + # w2: down_proj uses intermediate.matmul(w2[i].t()) = F.linear(intermediate, w2[i]) + # so W = w2[i] directly (no extra transpose needed). self.w2_linear = nn.ModuleList( [ - nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) + nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w2_linear, - self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( - 1, 2 - ), + self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), ) delattr(self, "w2") @@ -882,11 +891,18 @@ def num_experts(self): @property def top_k(self): - return self.router.moe_top_k + # In older transformers, top_k was stored on DbrxRouter as moe_top_k. + # In transformers 5.0, DbrxFFN stores it as a plain attribute (top_k). 
+ if hasattr(self.router, "moe_top_k"): + return self.router.moe_top_k + return self.__dict__.get("top_k", 1) @top_k.setter def top_k(self, value): - self.router.moe_top_k = value + if hasattr(self.router, "moe_top_k"): + self.router.moe_top_k = value + else: + self.__dict__["top_k"] = value @contextmanager diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 33730409a6..253aa665c6 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -110,7 +110,8 @@ def test_dbrx(): assert DbrxExpertGLU in QuantModuleRegistry config = DbrxConfig( - ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2), hidden_size=32 + ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2, hidden_size=32), + hidden_size=32, ) model_ref = DbrxFFN(config) @@ -131,14 +132,17 @@ def test_dbrx(): assert hasattr(expertglu_test, "v1_linear") and not hasattr(expertglu_test, "v1") assert hasattr(expertglu_test, "w2_linear") and not hasattr(expertglu_test, "w2") + # Weights are stored transposed (W = w1[i].T) to match F.linear semantics with + # transformers 5.0's raw matmul: x @ w1[i] = F.linear(x, w1[i].T) assert torch.allclose( - torch.concat(list(expertglu_test.w1_linear.parameters()), dim=0), + torch.concat([m.weight.T for m in expertglu_test.w1_linear], dim=0), expertglu_ref.w1, ) mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) - x = torch.randn(1, 4, 32) + # In transformers 5.0, the FFN input dimension is ffn_hidden_size (not hidden_size) + x = torch.randn(1, 4, 8) out_1 = model_ref(x) out_2 = model_test(x) assert torch.allclose(out_1[0], out_2[0]) From c72454cded23966009e114130bbed42b9f2e7500 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:49:39 -0700 Subject: [PATCH 05/28] Fix transformers load and test_llm_qat Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/opt/plugins/transformers.py | 35 ++++++++++++++++++++++ tests/examples/llm_qat/test_llm_qat.py | 3 +- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 220f4cae9e..a60ea1af3b 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -15,6 +15,7 @@ """ModelOpt plugin for enabling automatic save/restore of ModelOpt state for HuggingFace models.""" +import os import types from contextlib import contextmanager @@ -26,6 +27,7 @@ from ..conversion import ModeloptStateManager from .huggingface import ( + _get_modelopt_state_path, _new_save_pretrained, _patch_model_init_for_modelopt, enable_huggingface_checkpointing, @@ -60,6 +62,37 @@ def _undo_torch_init_override_by_transformers(): setattr(torch.nn.init, name, init_func) +def _restore_qtensor_wrappers(model, model_path): + """Re-wrap QTensorWrapper weights that were replaced during HF weight loading. + + Transformers>=5.0 uses ``setattr`` to load weights, which replaces ``QTensorWrapper`` + objects with plain ``Parameter`` tensors. The compressed data is loaded correctly but + the wrapper metadata (original shape, dtype, qtensor class) is lost. This function + reads the saved ``q_tensor_state`` from ``modelopt_state.pth`` and re-wraps the affected + weights. 
+ """ + modelopt_state_path = _get_modelopt_state_path(model_path) + if not os.path.isfile(modelopt_state_path): + return + + from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear + from modelopt.torch.quantization.qtensor import QTensorWrapper + + state = torch.load(modelopt_state_path, map_location="cpu", weights_only=False) + for _mode_name, mode_config in state.get("modelopt_state_dict", []): + q_tensor_state = mode_config.get("metadata", {}).get("q_tensor_state", {}) + for name, module in model.named_modules(): + if ( + isinstance(module, RealQuantLinear) + and name in q_tensor_state + and not isinstance(module.weight, QTensorWrapper) + ): + module._parameters["weight"] = QTensorWrapper( + qtensor=module.weight.data, + metadata=q_tensor_state[name]["metadata"], + ) + + def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs): """Patch for `cls.from_pretrained` method to restore ModelOpt state.""" with _patch_model_init_for_modelopt( @@ -69,6 +102,8 @@ def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs) pretrained_model_name_or_path, *args, **kwargs ) + _restore_qtensor_wrappers(model, pretrained_model_name_or_path) + return model diff --git a/tests/examples/llm_qat/test_llm_qat.py b/tests/examples/llm_qat/test_llm_qat.py index ebdb670247..5a0e7ad442 100644 --- a/tests/examples/llm_qat/test_llm_qat.py +++ b/tests/examples/llm_qat/test_llm_qat.py @@ -17,6 +17,7 @@ import pytest import torch from _test_utils.examples.run_command import run_example_command +from _test_utils.torch.misc import minimum_sm # fmt: off @@ -98,7 +99,7 @@ def test_llama_lora_qat_nvfp4(tiny_llama_path, tmp_path): ] ) - +@minimum_sm(90) def test_llama_qlora_nvfp4(tiny_llama_path, tmp_path): _run_command( [ From 46348a0e4510e4088baf230a8af4b417f751fc02 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:54:39 -0700 Subject: [PATCH 06/28] Remove tokenizer.batch_encode_plus Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/llm_sparsity/weight_sparsity/eval.py | 2 +- examples/llm_sparsity/weight_sparsity/hf_pts.py | 2 +- examples/windows/onnx_ptq/genai_llm/quantize.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llm_sparsity/weight_sparsity/eval.py b/examples/llm_sparsity/weight_sparsity/eval.py index 6b1d4ef17b..91199933e5 100644 --- a/examples/llm_sparsity/weight_sparsity/eval.py +++ b/examples/llm_sparsity/weight_sparsity/eval.py @@ -129,7 +129,7 @@ def __call__(self, instances: Sequence[dict]) -> dict[str, torch.Tensor]: [instance[key] for instance in instances] for key in ("src_idx", "label_idx") ) - batch_encoded = self.tokenizer.batch_encode_plus( + batch_encoded = self.tokenizer( sources, return_tensors="pt", padding=True, diff --git a/examples/llm_sparsity/weight_sparsity/hf_pts.py b/examples/llm_sparsity/weight_sparsity/hf_pts.py index ad8061211d..cae361e387 100644 --- a/examples/llm_sparsity/weight_sparsity/hf_pts.py +++ b/examples/llm_sparsity/weight_sparsity/hf_pts.py @@ -40,7 +40,7 @@ def get_calib_dataloader( else: raise NotImplementedError - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) if device: diff --git a/examples/windows/onnx_ptq/genai_llm/quantize.py b/examples/windows/onnx_ptq/genai_llm/quantize.py index d21d1d796b..13f6ac8045 100644 --- 
a/examples/windows/onnx_ptq/genai_llm/quantize.py +++ b/examples/windows/onnx_ptq/genai_llm/quantize.py @@ -180,7 +180,7 @@ def get_initial_inputs( """ # tokenizer.pad_token = "[PAD]" tokenizer.pad_token = tokenizer.eos_token - encodings_dict = tokenizer.batch_encode_plus(prompt, padding=True) + encodings_dict = tokenizer(prompt, padding=True) # max_length = model.config.max_position_embeddings # input_ids = tokenizer.encode(text, truncation=True, padding='max_length', max_length=max_length) @@ -242,7 +242,7 @@ def get_calib_inputs( # dataset2 = dataset2.shuffle(seed=42) dataset2 = dataset2[column][:calib_size] - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset2, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) # return_tensors="pt", batch_encoded = batch_encoded.to(device) From 1d9155bc773456c7dfb18c44385531870e407f1b Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:29:40 -0700 Subject: [PATCH 07/28] Remove deprecated transformers arguments Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/gpt-oss/configs/sft_full.yaml | 2 +- examples/gpt-oss/configs/sft_lora.yaml | 2 +- .../gpt-oss/qat-finetune-transformers.ipynb | 2 +- examples/llm_qat/launch.sh | 2 +- .../notebooks/QAT_QAD_Walkthrough.ipynb | 2 +- .../llm_sparsity/attention_sparsity/hf_sa.py | 2 +- .../llm_sparsity/weight_sparsity/finetune.py | 19 ++++++------------- .../weight_sparsity/launch_finetune.sh | 4 ++-- examples/speculative_decoding/README.md | 2 +- 9 files changed, 15 insertions(+), 22 deletions(-) diff --git a/examples/gpt-oss/configs/sft_full.yaml b/examples/gpt-oss/configs/sft_full.yaml index 33273c1e92..7d980b9d03 100644 --- a/examples/gpt-oss/configs/sft_full.yaml +++ b/examples/gpt-oss/configs/sft_full.yaml @@ -16,7 +16,7 @@ per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 2 max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 diff --git a/examples/gpt-oss/configs/sft_lora.yaml b/examples/gpt-oss/configs/sft_lora.yaml index 34f76a6e71..4b44ca4af9 100644 --- a/examples/gpt-oss/configs/sft_lora.yaml +++ b/examples/gpt-oss/configs/sft_lora.yaml @@ -21,7 +21,7 @@ lora_alpha: 16 lora_dropout: 0.0 lora_target_modules: all-linear max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 diff --git a/examples/gpt-oss/qat-finetune-transformers.ipynb b/examples/gpt-oss/qat-finetune-transformers.ipynb index 695ed39f67..58dba84cb6 100644 --- a/examples/gpt-oss/qat-finetune-transformers.ipynb +++ b/examples/gpt-oss/qat-finetune-transformers.ipynb @@ -207,7 +207,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03,\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=10,\n", diff --git a/examples/llm_qat/launch.sh b/examples/llm_qat/launch.sh index 6120476f17..cc3adc74fe 100755 --- a/examples/llm_qat/launch.sh +++ b/examples/llm_qat/launch.sh @@ -165,7 +165,7 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \ --save_total_limit 2 \ --learning_rate $LR \ --weight_decay 0.0 \ - --warmup_ratio 0.1 \ + --warmup_steps 0.1 \ --lr_scheduler_type linear \ --logging_steps 1 \ --report_to tensorboard \ diff --git 
a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index a9bb6589be..9c10c55c25 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -290,7 +290,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03,\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=50,\n", diff --git a/examples/llm_sparsity/attention_sparsity/hf_sa.py b/examples/llm_sparsity/attention_sparsity/hf_sa.py index ca92e5ebc9..c48c836756 100644 --- a/examples/llm_sparsity/attention_sparsity/hf_sa.py +++ b/examples/llm_sparsity/attention_sparsity/hf_sa.py @@ -111,7 +111,7 @@ def generate_sample_output(model, tokenizer, args): padding=False, ) if torch.cuda.is_available(): - inputs = {k: v.cuda() for k, v in inputs.items()} + inputs = {k: v.to(model.device) for k, v in inputs.items()} # Generate with torch.no_grad(): diff --git a/examples/llm_sparsity/weight_sparsity/finetune.py b/examples/llm_sparsity/weight_sparsity/finetune.py index 7110846683..6eb199adc5 100644 --- a/examples/llm_sparsity/weight_sparsity/finetune.py +++ b/examples/llm_sparsity/weight_sparsity/finetune.py @@ -297,13 +297,12 @@ def train(): ) last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." ) model = transformers.AutoModelForCausalLM.from_pretrained( @@ -335,18 +334,12 @@ def train(): # Detecting last checkpoint. last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0: - raise ValueError( - f"Output directory ({args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." 
) # Training diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh index a65e1e6003..7f8e71f255 100755 --- a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh +++ b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh @@ -88,11 +88,11 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \ --save_total_limit 10 \ --learning_rate 2e-5 \ --weight_decay 0.1 \ - --warmup_ratio 0.0 \ + --warmup_steps 0.0 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --fsdp 'full_shard auto_wrap' \ - --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \ + --fsdp_config '{\"transformer_layer_cls_to_wrap\": \"LlamaDecoderLayer\"}' \ --tf32 True \ --modelopt_restore_path $MODELOPT_RESTORE_PATH \ --report_to tensorboard \ diff --git a/examples/speculative_decoding/README.md b/examples/speculative_decoding/README.md index 2a29f644e6..8d75eb06f8 100644 --- a/examples/speculative_decoding/README.md +++ b/examples/speculative_decoding/README.md @@ -308,7 +308,7 @@ This will modify the model in-place with eagle training forward, making it compa ```python # Create a trainer -trainer = transformers.Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) +trainer = transformers.Trainer(model=model, processing_class=tokenizer, args=training_args, **data_module) trainer._move_model_to_device(model, trainer.args.device) # Enable HF checkpointing so that the saved model will contain the speculative decoding module From ee51fd71246923ef61311109121c0620762901cc Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:44:01 -0700 Subject: [PATCH 08/28] Rename torch_dtype to dtype Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- CHANGELOG.rst | 9 +++++++-- examples/gpt-oss/sft.py | 2 +- examples/llm_autodeploy/run_auto_quantize.py | 2 +- examples/llm_eval/modeling.py | 6 +++--- examples/llm_ptq/example_utils.py | 6 +++--- examples/llm_ptq/multinode_ptq.py | 4 +--- examples/llm_qat/main.py | 6 ++---- examples/llm_sparsity/attention_sparsity/hf_sa.py | 5 +---- examples/llm_sparsity/weight_sparsity/eval.py | 2 +- .../llm_sparsity/weight_sparsity/export_trtllm_ckpt.py | 2 +- examples/llm_sparsity/weight_sparsity/hf_pts.py | 2 +- .../specdec_bench/models/specbench_medusa.py | 2 +- .../collect_hidden_states/compute_hidden_states_hf.py | 2 +- examples/speculative_decoding/main.py | 4 ++-- modelopt/onnx/llm_export_utils/export_utils.py | 2 +- modelopt/torch/quantization/plugins/accelerate.py | 6 ++++-- modelopt/torch/quantization/plugins/huggingface.py | 5 +---- .../torch/sparsity/attention_sparsity/model_sparsify.py | 2 +- 18 files changed, 33 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d52ad0c2ad..1ecd3976b0 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,5 +1,6 @@ -NVIDIA Model Optimizer Changelog -================================ +Changelog +========= + 0.44 (2026-05-xx) ^^^^^^^^^^^^^^^^^ @@ -15,6 +16,10 @@ NVIDIA Model Optimizer Changelog - Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this. +**Misc** + +- Add ``transformers>=5.0`` support. 
+ 0.43 (2026-04-09) ^^^^^^^^^^^^^^^^^ diff --git a/examples/gpt-oss/sft.py b/examples/gpt-oss/sft.py index cc896021fa..4d30fc0fd7 100644 --- a/examples/gpt-oss/sft.py +++ b/examples/gpt-oss/sft.py @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args): "revision": model_args.model_revision, "trust_remote_code": model_args.trust_remote_code, "attn_implementation": model_args.attn_implementation, - "torch_dtype": getattr(model_args, "dtype", "bfloat16"), + "dtype": getattr(model_args, "dtype", "float32"), "use_cache": not training_args.gradient_checkpointing, } diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index e9ecb0731f..e9e6107af2 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -121,7 +121,7 @@ def modelopt_ptq( ) -> torch.nn.Module: """Quantize the model with modelopt.""" model = AutoModelForCausalLM.from_pretrained( - model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto" + model_path, trust_remote_code=True, dtype="auto", device_map="auto" ) model.eval() diff --git a/examples/llm_eval/modeling.py b/examples/llm_eval/modeling.py index d06d055603..4cd9cc8bad 100644 --- a/examples/llm_eval/modeling.py +++ b/examples/llm_eval/modeling.py @@ -188,7 +188,7 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) @@ -243,7 +243,7 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation self.model = AutoModelForCausalLM.from_pretrained( @@ -322,7 +322,7 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = LlamaForCausalLM.from_pretrained(self.model_path, **args) print_gpu_utilization() if self.lora_path: diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 58eb676111..2a02283799 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -567,7 +567,7 @@ def get_model( model_kwargs = config_kwargs.copy() # Don't set torch_dtype for VILA models as they handle it explicitly in their builder if "vila" not in ckpt_path.lower(): - model_kwargs.setdefault("torch_dtype", "auto") + model_kwargs.setdefault("dtype", "auto") if "vila" in ckpt_path.lower(): hf_vila = AutoModel.from_pretrained( @@ -618,7 +618,7 @@ def has_pack_quantized_config(config): ckpt_path, device_map="auto", trust_remote_code=trust_remote_code, - torch_dtype="auto", + dtype="auto", ) else: architecture = hf_config.architectures[0] @@ -650,7 +650,7 @@ def has_pack_quantized_config(config): model_kwargs2 = model_kwargs.copy() if auto_model_module not in 
[AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) - model_kwargs2["torch_dtype"] = torch_dtype + model_kwargs2["dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) model = from_config(hf_config, **model_kwargs2) diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py index 624307cda2..93ef21ea4d 100644 --- a/examples/llm_ptq/multinode_ptq.py +++ b/examples/llm_ptq/multinode_ptq.py @@ -149,9 +149,7 @@ def load_and_prepare_model( Tuple of (prepared_model, model_type, original_architectures, calibration_dataloader) """ model = AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - trust_remote_code=trust_remote_code, + model_path, dtype="auto", trust_remote_code=trust_remote_code ) model.eval() model_type = get_model_type(model) diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 9435157259..2edbf3ccbb 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -166,9 +166,7 @@ def train(): print_rank_0(f"Last checkpoint detected: {last_checkpoint}") model = transformers.AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + model_args.model_name_or_path, cache_dir=training_args.cache_dir, dtype=torch.bfloat16 ) model.generation_config.do_sample = True tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -223,7 +221,7 @@ def train(): teacher_model = transformers.AutoModelForCausalLM.from_pretrained( model_args.teacher_model, cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) distill_config = { "teacher_model": teacher_model, diff --git a/examples/llm_sparsity/attention_sparsity/hf_sa.py b/examples/llm_sparsity/attention_sparsity/hf_sa.py index c48c836756..d6c5bd025a 100644 --- a/examples/llm_sparsity/attention_sparsity/hf_sa.py +++ b/examples/llm_sparsity/attention_sparsity/hf_sa.py @@ -143,10 +143,7 @@ def main(args): # No need to specify attn_implementation here — mtsa.sparsify() sets it # automatically ("eager" for pytorch backend, "modelopt_triton" for triton). 
model = AutoModelForCausalLM.from_pretrained( - args.pyt_ckpt_path, - attn_implementation="eager", - torch_dtype="auto", - device_map="auto", + args.pyt_ckpt_path, attn_implementation="eager", dtype="auto", device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path) diff --git a/examples/llm_sparsity/weight_sparsity/eval.py b/examples/llm_sparsity/weight_sparsity/eval.py index 91199933e5..a5f2fb91b2 100644 --- a/examples/llm_sparsity/weight_sparsity/eval.py +++ b/examples/llm_sparsity/weight_sparsity/eval.py @@ -254,7 +254,7 @@ def main(): dataloader = get_dataloader( accelerator, dataset, tokenizer, args.model_max_length, args.batch_size, shuffle=False ) - model = AutoModelForCausalLM.from_pretrained(args.model_dir, torch_dtype=torch.float16).to( + model = AutoModelForCausalLM.from_pretrained(args.model_dir, dtype=torch.float16).to( accelerator.device ) diff --git a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py index 0fb64f9589..2cf7ca3a7a 100644 --- a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py +++ b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py @@ -74,7 +74,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/llm_sparsity/weight_sparsity/hf_pts.py b/examples/llm_sparsity/weight_sparsity/hf_pts.py index cae361e387..77574c1c2c 100644 --- a/examples/llm_sparsity/weight_sparsity/hf_pts.py +++ b/examples/llm_sparsity/weight_sparsity/hf_pts.py @@ -98,7 +98,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py index e483f379c3..0165505d2f 100644 --- a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py +++ b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py @@ -100,7 +100,7 @@ def __init__( self.draft_model_path, model_dir, medusa_num_heads=self.medusa_num_heads, - torch_dtype=torch_dtype, + dtype=torch_dtype, low_cpu_mem_usage=True, ) self.model = self.model.to(self.device) diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py index a3d1681c4c..5cc0a1065a 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py @@ -130,7 +130,7 @@ def keep_conversation(entry): dataset = dataset.select(range(args.debug_max_num_conversations)) model = AutoModel.from_pretrained( - args.model, torch_dtype="auto", device_map="auto", trust_remote_code=True + args.model, dtype="auto", device_map="auto", trust_remote_code=True ) num_hidden_layers = getattr(model.config, "num_hidden_layers", None) diff --git a/examples/speculative_decoding/main.py 
b/examples/speculative_decoding/main.py index 3369d399c2..880c6b5672 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -185,7 +185,7 @@ def train(): if checkpoint: with patch_transformers5_params_loading(): model = load_vlm_or_llm( - checkpoint, torch_dtype="auto", trust_remote_code=model_args.trust_remote_code + checkpoint, dtype="auto", trust_remote_code=model_args.trust_remote_code ) tokenizer = transformers.AutoTokenizer.from_pretrained( checkpoint, trust_remote_code=model_args.trust_remote_code @@ -197,7 +197,7 @@ def train(): model_args.model_name_or_path, use_fake_base=model_args.use_fake_base_for_offline, use_offline_training=use_offline_training, - torch_dtype="auto", + dtype="auto", device_map="cpu", trust_remote_code=model_args.trust_remote_code, ) diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py index 4009b119e7..f45b473059 100644 --- a/modelopt/onnx/llm_export_utils/export_utils.py +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -53,7 +53,7 @@ def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM: """Load HuggingFace model based on model type.""" print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}") self.hf_model = AutoModelForCausalLM.from_pretrained( - self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + self.hf_model_path, dtype=torch.float16, trust_remote_code=trust_remote_code ) return self.hf_model.eval().cuda() # type: ignore[attr-defined] diff --git a/modelopt/torch/quantization/plugins/accelerate.py b/modelopt/torch/quantization/plugins/accelerate.py index 59731cc8ad..13999df0f0 100644 --- a/modelopt/torch/quantization/plugins/accelerate.py +++ b/modelopt/torch/quantization/plugins/accelerate.py @@ -190,8 +190,10 @@ def patched_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwar with init_empty_weights(): # Fix torch_dtype to match original model - torch_dtype = kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) - model = cls.from_config(config, torch_dtype=torch_dtype) + torch_dtype = kwargs.get( + "dtype", kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) + ) + model = cls.from_config(config, dtype=torch_dtype) mtq.quantize(model, quant_cfg) mtq.compress(model, config=mtq.CompressConfig(quant_gemm=quant_gemm)) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 45bd50631d..b40623aa20 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -925,10 +925,7 @@ def patch_compressed_linear_loading(): with patch_compressed_linear_loading(): model = AutoModelForCausalLM.from_pretrained( - ckpt_path, - device_map="auto", - trust_remote_code=True, - torch_dtype="auto", + ckpt_path, device_map="auto", trust_remote_code=True, dtype="auto" ) """ try: diff --git a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py index 28c18943a2..a33938b057 100644 --- a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py +++ b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py @@ -139,7 +139,7 @@ def forward_loop(model) -> float: model = AutoModelForCausalLM.from_pretrained( model_path, attn_implementation="eager", # Required for sparse attention - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) This 
is because sparse attention works by patching torch.nn.functional.softmax, From aa6c3ce4af9cb13812a67da4c99cc4048886a231 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:12:09 -0700 Subject: [PATCH 09/28] Remove hard-coded trust_remote_code=True Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .../gpt-oss/convert_oai_mxfp4_weight_only.py | 14 ++++++----- examples/llm_autodeploy/run_auto_quantize.py | 12 ++++++++-- examples/llm_eval/lm_eval_hf.py | 3 +-- examples/llm_eval/modeling.py | 23 ++++++++++++------- examples/llm_ptq/example_utils.py | 14 +++++++---- examples/llm_ptq/hf_ptq.py | 2 ++ examples/llm_ptq/vlm_utils.py | 22 ++++++++++++------ .../llm_qad/data_utils/download_dataset.py | 12 +++++++--- .../compute_hidden_states_hf.py | 10 ++++++-- .../scripts/ar_validate.py | 9 ++++++-- .../scripts/export_hf_checkpoint.py | 8 +++---- .../scripts/send_conversation_vllm.py | 10 +++++++- examples/vllm_serve/fakequant_worker.py | 3 +-- .../windows/accuracy_benchmark/modeling.py | 21 ++++++++++++----- examples/windows/onnx_ptq/whisper/README.md | 2 +- .../whisper/whisper_onnx_quantization.py | 2 +- .../whisper/whisper_optimum_ort_inference.py | 4 +--- modelopt/torch/speculative/utils.py | 6 ++--- modelopt/torch/utils/speech_dataset_utils.py | 4 +--- tests/gpu/torch/quantization/test_gptq.py | 4 +--- 20 files changed, 121 insertions(+), 64 deletions(-) diff --git a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py index bebb914869..4f471ef484 100644 --- a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py +++ b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py @@ -95,21 +95,23 @@ def convert_and_save(model, tokenizer, output_path: str): def create_parser(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.") - + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) parser.add_argument( "--lora_path", type=str, help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.", ) - parser.add_argument( "--base_path", type=str, help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.", ) - parser.add_argument( "--output_path", type=str, required=True, help="location to save converted model." ) @@ -121,7 +123,7 @@ def create_parser(): parser = create_parser() args = parser.parse_args() - kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True} + kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code} if args.lora_path: assert args.model_path is None, "You can only specify lora_path or model_path, not both." 
model_path = args.base_path @@ -140,7 +142,7 @@ def create_parser(): gc.collect() # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code) # Quantize and save model convert_and_save(model, tokenizer, args.output_path) diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index e9e6107af2..931c1153d3 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -118,10 +118,11 @@ def modelopt_ptq( auto_quantize_bits: float | None = None, calib_dataset: str = "cnn_dailymail", calib_batch_size: int = 8, + trust_remote_code: bool = False, ) -> torch.nn.Module: """Quantize the model with modelopt.""" model = AutoModelForCausalLM.from_pretrained( - model_path, trust_remote_code=True, dtype="auto", device_map="auto" + model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto" ) model.eval() @@ -129,7 +130,7 @@ def modelopt_ptq( model_path, model_max_length=2048, padding_side="left", - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) # sanitize tokenizer if tokenizer.pad_token != "": @@ -203,6 +204,12 @@ def modelopt_ptq( "regular quantization without auto_quantize search will be applied." ), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remotecode for Huggingface models and tokenizers", + default=False, + action="store_true", + ) args = parser.parse_args() @@ -213,4 +220,5 @@ def modelopt_ptq( args.num_samples, auto_quantize_bits=args.effective_bits, calib_batch_size=args.calib_batch_size, + trust_remote_code=args.trust_remote_code, ) diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py index 405e8590a5..11d736a429 100755 --- a/examples/llm_eval/lm_eval_hf.py +++ b/examples/llm_eval/lm_eval_hf.py @@ -38,6 +38,7 @@ # limitations under the License. 
import warnings +import datasets from lm_eval import utils from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser from lm_eval.api.model import T @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args(): model_args = utils.simple_parse_args_string(args.model_args) if args.trust_remote_code: - import datasets - datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True model_args["trust_remote_code"] = True args.trust_remote_code = None diff --git a/examples/llm_eval/modeling.py b/examples/llm_eval/modeling.py index 4cd9cc8bad..71e048e1a3 100644 --- a/examples/llm_eval/modeling.py +++ b/examples/llm_eval/modeling.py @@ -74,6 +74,7 @@ class EvalModel(BaseModel, arbitrary_types_allowed=True): model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -92,7 +93,6 @@ def load(self): class OpenAIModel(EvalModel): - model_path: str engine: str = "" use_azure: bool = False tokenizer: tiktoken.Encoding | None @@ -173,7 +173,6 @@ def handler(signum, frame): class SeqToSeqModel(EvalModel): - model_path: str model: PreTrainedModel | None = None tokenizer: PreTrainedTokenizer | None = None lora_path: str = "" @@ -191,7 +190,9 @@ def load(self): args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = PeftModel.from_pretrained(self.model, self.lora_path) @@ -199,7 +200,9 @@ def load(self): if "device_map" not in args: self.model.to(self.device) if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -247,7 +250,7 @@ def load(self): if self.attn_implementation: args["attn_implementation"] = self.attn_implementation self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -256,7 +259,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -487,10 +492,12 @@ def test_max_length(self): class ChatGLMModel(SeqToSeqModel): def load(self): if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) if self.model is None: self.model = AutoModel.from_pretrained( - self.model_path, trust_remote_code=True + self.model_path, trust_remote_code=self.trust_remote_code ).half() # FP16 is required for ChatGLM self.model.eval() self.model.to(self.device) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 2a02283799..a4515baacb 100755 --- 
a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -53,7 +53,13 @@ def run_nemotron_vl_preview( - full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False + full_model, + tokenizer, + input_ids, + pyt_ckpt_path, + stage_name, + allow_fallback=False, + trust_remote_code=False, ): """Run text-only and VL preview generation for Nemotron VL models. @@ -64,7 +70,7 @@ def run_nemotron_vl_preview( pyt_ckpt_path: Path to the model checkpoint stage_name: Description of the stage (e.g., "before quantization", "after quantization") allow_fallback: Whether to allow fallback to standard generate on failure - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated text response or None if generation failed """ @@ -80,7 +86,7 @@ def run_nemotron_vl_preview( # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse) text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path + full_model, tokenizer, question, generation_config, pyt_ckpt_path, trust_remote_code ) generated_ids = None @@ -93,7 +99,7 @@ def run_nemotron_vl_preview( # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") - run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name) + run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name, trust_remote_code) return generated_ids diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b81dc60c01..54e0984c71 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -758,6 +758,7 @@ def pre_quantize( args.pyt_ckpt_path, "before quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -808,6 +809,7 @@ def post_quantize( args.pyt_ckpt_path, "after quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: warnings.warn( diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 9919e405ba..abfebbd4f0 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -21,7 +21,7 @@ from transformers import AutoImageProcessor, AutoProcessor -def run_vl_preview_generation(model, tokenizer, model_path, stage_name): +def run_vl_preview_generation(model, tokenizer, model_path, stage_name, trust_remote_code=False): """Run preview generation for VL models using sample images. 
Args: @@ -29,7 +29,7 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): tokenizer: The tokenizer model_path: Path to the model (for loading image processor) stage_name: Description of the stage (e.g., "before quantization") - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text for logging/comparison """ @@ -85,7 +85,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): # Try to detect the VL model has chat method or generate method if hasattr(model, "chat"): - image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) + image_processor = AutoImageProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) image_features = image_processor([image]) # Pass as list with single image @@ -103,7 +105,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): **image_features, ) else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Use chat template if available, otherwise fall back to default task prompt if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: @@ -190,7 +194,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): return None -def run_text_only_generation(model, tokenizer, question, generation_config, model_path): +def run_text_only_generation( + model, tokenizer, question, generation_config, model_path, trust_remote_code=False +): """Run text-only generation for VL models, supporting both chat and generate methods. Args: @@ -199,7 +205,7 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode question: The text question to ask generation_config: Generation configuration model_path: Path to the model (for loading processor if needed) - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text or None if failed """ @@ -209,7 +215,9 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode response = model.chat(tokenizer, None, question, generation_config, history=None) return response else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Create text-only messages messages = [ diff --git a/examples/llm_qad/data_utils/download_dataset.py b/examples/llm_qad/data_utils/download_dataset.py index e3e3d0646e..31992d9c1f 100644 --- a/examples/llm_qad/data_utils/download_dataset.py +++ b/examples/llm_qad/data_utils/download_dataset.py @@ -30,14 +30,14 @@ _TOKENIZER = None -def init_tokenizer(name: str) -> None: +def init_tokenizer(name: str, trust_remote_code: bool = False) -> None: """Load HuggingFace tokenizer for chat template.""" global _TOKENIZER if name: from transformers import AutoTokenizer print(f"Loading tokenizer: {name}") - _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=True) + _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=trust_remote_code) def format_text(messages: list[dict], reasoning: str = "") -> str: @@ -159,10 +159,16 @@ def main(): p.add_argument( "--include-reasoning", action="store_true", help="Include COT for Thinking models" ) + p.add_argument( + "--trust_remote_code", + help="Set trust_remotecode for Huggingface models 
and tokenizers", + default=False, + action="store_true", + ) args = p.parse_args() if args.tokenizer: - init_tokenizer(args.tokenizer) + init_tokenizer(args.tokenizer, args.trust_remote_code) # Build suffix suffix = f"{int(args.sample_percent)}pct" diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py index 5cc0a1065a..fbe3b27683 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py @@ -85,6 +85,12 @@ def parse_args() -> argparse.Namespace: default=1, help="""Data parallel world size. Number of tasks on SLURM.""", ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remotecode for Huggingface models and tokenizers", + default=False, + action="store_true", + ) return parser.parse_args() @@ -130,11 +136,11 @@ def keep_conversation(entry): dataset = dataset.select(range(args.debug_max_num_conversations)) model = AutoModel.from_pretrained( - args.model, dtype="auto", device_map="auto", trust_remote_code=True + args.model, dtype="auto", device_map="auto", trust_remote_code=args.trust_remote_code ) num_hidden_layers = getattr(model.config, "num_hidden_layers", None) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "") diff --git a/examples/speculative_decoding/scripts/ar_validate.py b/examples/speculative_decoding/scripts/ar_validate.py index d1bf31a1ab..2890b2a5d5 100644 --- a/examples/speculative_decoding/scripts/ar_validate.py +++ b/examples/speculative_decoding/scripts/ar_validate.py @@ -55,6 +55,7 @@ def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=80, device=No def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True, help="Path to model directory") + parser.add_argument("--trust_remote_code", type=bool, default=False, help="Trust remote code") parser.add_argument("--steps", type=int, default=3, help="Steps for AR validation") parser.add_argument( "--osl", type=int, default=32, help="Output sequence length for AR validation" @@ -72,8 +73,12 @@ def main(): accelerator = Accelerator() # Load model and tokenizer - model = load_vlm_or_llm(args.model_path, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(args.model_path) + model = load_vlm_or_llm( + args.model_path, device_map="auto", trust_remote_code=args.trust_remote_code + ) + tokenizer = AutoTokenizer.from_pretrained( + args.model_path, trust_remote_code=args.trust_remote_code + ) model.eval() model = accelerator.prepare(model) diff --git a/examples/speculative_decoding/scripts/export_hf_checkpoint.py b/examples/speculative_decoding/scripts/export_hf_checkpoint.py index 925f4b73d0..7a8823099a 100644 --- a/examples/speculative_decoding/scripts/export_hf_checkpoint.py +++ b/examples/speculative_decoding/scripts/export_hf_checkpoint.py @@ -29,6 +29,7 @@ def parse_args(): description="Export a HF checkpoint (with ModelOpt state) for deployment." 
) parser.add_argument("--model_path", type=str, default="Path of the trained checkpoint.") + parser.add_argument("--trust_remote_code", type=bool, default=False, help="Trust remote code") parser.add_argument( "--export_path", type=str, default="Destination directory for exported files." ) @@ -38,11 +39,8 @@ def parse_args(): mto.enable_huggingface_checkpointing() args = parse_args() -model = load_vlm_or_llm(args.model_path, torch_dtype="auto") +model = load_vlm_or_llm(args.model_path, dtype="auto", trust_remote_code=args.trust_remote_code) model.eval() with torch.inference_mode(): - export_speculative_decoding( - model, - export_dir=args.export_path, - ) + export_speculative_decoding(model, export_dir=args.export_path) print(f"Exported checkpoint to {args.export_path}") diff --git a/examples/speculative_decoding/scripts/send_conversation_vllm.py b/examples/speculative_decoding/scripts/send_conversation_vllm.py index 5101b4e6f9..ab55cd6863 100644 --- a/examples/speculative_decoding/scripts/send_conversation_vllm.py +++ b/examples/speculative_decoding/scripts/send_conversation_vllm.py @@ -55,6 +55,12 @@ def parse_args() -> argparse.Namespace: "the local serving engine. This should match the value used by the server." ), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remotecode for Huggingface models and tokenizers", + default=False, + action="store_true", + ) ## Client Parameters ## parser.add_argument( "--base-url", @@ -133,7 +139,9 @@ async def main(args: argparse.Namespace) -> None: base_url=args.base_url, ) - tokenizer = AutoTokenizer.from_pretrained(args.model_card, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.model_card, trust_remote_code=args.trust_remote_code + ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token bos_token_id = tokenizer.bos_token_id diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index ec2b1f4033..262f2a5360 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -49,8 +49,7 @@ def _fakequant_run_prolog_worker(self) -> None: trust_remote_code = os.environ.get("TRUST_REMOTE_CODE", "false").lower() == "true" tokenizer = AutoTokenizer.from_pretrained( - self.model_runner.model_config.tokenizer, - trust_remote_code=trust_remote_code, + self.model_runner.model_config.tokenizer, trust_remote_code=trust_remote_code ) if tokenizer.pad_token != "" or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token diff --git a/examples/windows/accuracy_benchmark/modeling.py b/examples/windows/accuracy_benchmark/modeling.py index 273a944c57..f17300be94 100644 --- a/examples/windows/accuracy_benchmark/modeling.py +++ b/examples/windows/accuracy_benchmark/modeling.py @@ -49,6 +49,7 @@ class EvalModel(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -84,7 +85,9 @@ def load(self): args.update(torch_dtype=getattr(torch, self.dtype)) else: args.update(torch_dtype="auto") - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = PeftModel.from_pretrained(self.model, self.lora_path) @@ -92,7 +95,9 @@ def load(self): if "device_map" not in args: 
self.model.to(self.device) if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -143,7 +148,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -152,7 +157,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -200,7 +207,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = AutoAWQForCausalLM.from_quantized( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -209,7 +216,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() diff --git a/examples/windows/onnx_ptq/whisper/README.md b/examples/windows/onnx_ptq/whisper/README.md index 8757aaeb53..82ae782200 100644 --- a/examples/windows/onnx_ptq/whisper/README.md +++ b/examples/windows/onnx_ptq/whisper/README.md @@ -174,7 +174,7 @@ These scripts are currently validated with following settings: - Calibration size - 32 - Calibration EPs - \[`cuda`, `cpu`\] - Audio dataset - `librispeech_asr` dataset (32 samples used for calibration, 100+ samples used for WER test) - - `load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True)` + - `load_dataset("librispeech_asr", "clean", split="test")` - Quantization support for various ONNX files - `encoder_model.onnx`, `decoder_model.onnx`, `decoder_with_past_model.onnx` - The `use_merged` argument in optimum-ORT's Whisper model API is kept False. 
diff --git a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py index 7b3e3d3197..03d2c49801 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py +++ b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py @@ -275,7 +275,7 @@ def main(args): processor = WhisperProcessor.from_pretrained(args.model_name, cache_dir=args.cache_dir) - asr_dataset = load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True) + asr_dataset = load_dataset("librispeech_asr", "clean", split="test") # asr_dataset = load_dataset("librispeech_asr", "all", split="test.clean") calib_data = None diff --git a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py index 52d56fe048..a1f39b8f04 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py +++ b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py @@ -85,9 +85,7 @@ def main(args): print(f"\n\n-- Content of input audio-file = {prediction}\n\n") if args.run_wer_test: - librispeech_test_clean = load_dataset( - "librispeech_asr", "clean", split="test", trust_remote_code=True - ) + librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test") references = [] predictions = [] diff --git a/modelopt/torch/speculative/utils.py b/modelopt/torch/speculative/utils.py index 9e167c8dc9..7bc6e2be0a 100644 --- a/modelopt/torch/speculative/utils.py +++ b/modelopt/torch/speculative/utils.py @@ -488,7 +488,7 @@ def load_vlm_or_llm( model_name_or_path: str, use_fake_base: bool = False, use_offline_training: bool = False, - torch_dtype: str | torch.dtype | None = None, + dtype: str | torch.dtype | None = None, device_map: str | None = None, trust_remote_code: bool = False, ): @@ -502,7 +502,7 @@ def load_vlm_or_llm( Args: model_name_or_path: Local path or HuggingFace repo ID of the model. use_offline_training: Whether to load a memory-efficient model for offline training. - torch_dtype: dtype to use when loading the model. + dtype: dtype to use when loading the model. device_map: Device map passed to ``from_pretrained``. trust_remote_code: Whether to trust remote code. 
""" @@ -528,7 +528,7 @@ def load_vlm_or_llm( model = model_cls.from_pretrained( model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map=device_map, **extra, ) diff --git a/modelopt/torch/utils/speech_dataset_utils.py b/modelopt/torch/utils/speech_dataset_utils.py index a71d73773e..ef0660175e 100644 --- a/modelopt/torch/utils/speech_dataset_utils.py +++ b/modelopt/torch/utils/speech_dataset_utils.py @@ -48,9 +48,7 @@ def _get_speech_dataset(dataset_name: str, num_samples: int): # Use streaming can reduce the downloading time for large datasets dataset = load_dataset( - **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], - trust_remote_code=True, - streaming=True, + **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], streaming=True ) else: raise NotImplementedError( diff --git a/tests/gpu/torch/quantization/test_gptq.py b/tests/gpu/torch/quantization/test_gptq.py index 0c60bcd007..d43177cae2 100644 --- a/tests/gpu/torch/quantization/test_gptq.py +++ b/tests/gpu/torch/quantization/test_gptq.py @@ -163,9 +163,7 @@ def test_gptq_e2e_flow(quant_cfg): model = AutoModelForCausalLM.from_pretrained( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto" ) - tokenizer = AutoTokenizer.from_pretrained( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True - ) + tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # can't set attribute 'pad_token' for "" # We skip this step for Nemo models From 7343c4ffbf6848af5c96ffdb3cff0db232b48b7d Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 01:24:13 -0700 Subject: [PATCH 10/28] Fix unit tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .../workflows/delete_outdated_pr_branches.yml | 47 ------------------- .../plugins/test_transformers_save_load.py | 5 +- .../quantization/plugins/test_huggingface.py | 2 +- 3 files changed, 4 insertions(+), 50 deletions(-) delete mode 100644 .github/workflows/delete_outdated_pr_branches.yml diff --git a/.github/workflows/delete_outdated_pr_branches.yml b/.github/workflows/delete_outdated_pr_branches.yml deleted file mode 100644 index 532b5c5b7d..0000000000 --- a/.github/workflows/delete_outdated_pr_branches.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Delete Outdated PR Branches - -on: - schedule: - - cron: "0 9 * * 1" # Every Monday at 9:00 UTC - workflow_dispatch: # On-demand - -permissions: - contents: write - pull-requests: read - -jobs: - delete-outdated-pr-branches: - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Delete branches for closed/merged PRs - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - REPO="${{ github.repository }}" - DELETED=0 - SKIPPED=0 - - # List all remote branches matching pull-request/ - git fetch --prune origin - for branch in $(git branch -r | grep -oP 'origin/pull-request/\K[0-9]+' | sort -un); do - FULL_BRANCH="pull-request/${branch}" - STATE=$(gh pr view "$branch" --repo "$REPO" --json state --jq '.state' 2>/dev/null || echo "") - - if [ "$STATE" = "CLOSED" ] || [ "$STATE" = "MERGED" ]; then - echo "Deleting branch '${FULL_BRANCH}' (PR #${branch} is ${STATE})" - git push origin --delete "$FULL_BRANCH" && DELETED=$((DELETED + 1)) || true - elif [ "$STATE" = "OPEN" ]; then - echo "Skipping branch '${FULL_BRANCH}' (PR #${branch} is still OPEN)" - SKIPPED=$((SKIPPED + 1)) - else - echo "Skipping branch 
'${FULL_BRANCH}' (could not determine PR #${branch} state)" - SKIPPED=$((SKIPPED + 1)) - fi - done - - echo "" - echo "Done. Deleted: ${DELETED}, Skipped: ${SKIPPED}" diff --git a/tests/unit/torch/opt/plugins/test_transformers_save_load.py b/tests/unit/torch/opt/plugins/test_transformers_save_load.py index 25b182b9bd..fced5734e4 100644 --- a/tests/unit/torch/opt/plugins/test_transformers_save_load.py +++ b/tests/unit/torch/opt/plugins/test_transformers_save_load.py @@ -17,6 +17,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest +import torch from _test_utils.torch.opt.utils import apply_mode_with_sampling from _test_utils.torch.transformers_models import ( create_tiny_llama_dir, @@ -27,7 +28,7 @@ @pytest.mark.parametrize("model_cls", [LlamaForCausalLM, AutoModelForCausalLM]) def test_causal_lm_save_restore(tmp_path, model_cls): - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = model_cls.from_pretrained(tiny_llama_dir) # TODO: Add calibrate, compress mode to the test model_ref = apply_mode_with_sampling( @@ -41,7 +42,7 @@ def test_causal_lm_save_restore(tmp_path, model_cls): def test_causal_lm_from_config(tmp_path): """Test loading a model using from_config after applying optimizations""" - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = AutoModelForCausalLM.from_pretrained(tiny_llama_dir) model_ref = apply_mode_with_sampling( diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 253aa665c6..8f9ad8bd9c 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -194,7 +194,7 @@ def forward_step(model, batch): ], ) def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): - tiny_llama_dir = create_tiny_llama_dir(tmp_path) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, dtype=torch.float32) # update config to fit test cases if quant_config == mtq.INT4_AWQ_CFG: quant_config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 16} From 31efc3653e07ac40c7ba1d126a07e6fb98c1a80e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 01:39:57 -0700 Subject: [PATCH 11/28] Enable some quantizer manual tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .../torch/quantization/tensor_quantizer_common.py | 3 +-- tests/unit/torch/quantization/test_calibrator.py | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py index ad2722dca6..8559192718 100644 --- a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py +++ b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py @@ -144,10 +144,9 @@ def test_max_calib(self): rtol=0, ) - @pytest.mark.manual(reason="slow test, run with --run-manual") def test_entropy_and_percentile_calib(self): """Don't really have a good way to test it.""" - quant_attr_cfg1 = QuantizerAttributeConfig(calib_method="histogram") + quant_attr_cfg1 = QuantizerAttributeConfig(calibrator="histogram") quantizer1 = TensorQuantizer(quant_attr_cfg1, if_calib=True, 
if_quant=False).to(self.device) x_1 = torch.rand(3, 6, 7, 7).to(self.device) diff --git a/tests/unit/torch/quantization/test_calibrator.py b/tests/unit/torch/quantization/test_calibrator.py index 4cb7458912..19c86b0b9f 100644 --- a/tests/unit/torch/quantization/test_calibrator.py +++ b/tests/unit/torch/quantization/test_calibrator.py @@ -88,8 +88,8 @@ def test_track_amax_raises(self): max_calibrator.collect(x_3) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestHistogramCalibrator: + @pytest.mark.skip(reason="TODO: Fix assertions in test_grow") def test_grow(self, verbose): x_1 = torch.tensor([0, 255, 255, 255, 255, 255]) x_2 = torch.tensor([0, 255, 255, 255, 255, 256]) @@ -181,7 +181,6 @@ def test_torch_hist(self): ) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestEntropyCalibrator: def test_one_tensor(self, verbose): hist_calibrator = calib.HistogramCalibrator( @@ -244,7 +243,6 @@ def test_repr(self): repr(hist_calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestMSECalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False, num_bins=32) @@ -299,7 +297,6 @@ def test_repr(self): repr(calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestPercentileCalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False) @@ -359,7 +356,6 @@ def test_range(self): calibrator.compute_amax("percentile", percentile=200) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestCalibrateWeights: def test_max(self): ref_lenet = QuantConvLinear() From f69d9fab08cbdae59365597a221c5ab2e883b91c Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 02:40:11 -0700 Subject: [PATCH 12/28] fix test Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/llm_ptq/README.md | 5 ++- examples/llm_ptq/requirements-t5.txt | 1 - examples/llm_ptq/requirements-whisper.txt | 2 - modelopt/torch/trace/plugins/transformers.py | 11 +++-- tests/examples/llm_ptq/test_llm_ptq.py | 44 +++++++------------ .../quantization/plugins/test_huggingface.py | 6 ++- 6 files changed, 28 insertions(+), 41 deletions(-) delete mode 100644 examples/llm_ptq/requirements-t5.txt delete mode 100644 examples/llm_ptq/requirements-whisper.txt diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 4d22390763..fc06bc95d3 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -115,7 +115,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http | Kimi K2 | - | - | - | - | ✅ | | MiniMax M2.1 | - | - | - | - | ✅ | | T5 | ✅ | ✅ | ✅ | ✅ | - | -| Whisper | ✅ | ❌ | ❌ | ❌ | - | +| Whisper9 | ✅ | ❌ | ❌ | ❌ | - | | Nemotron-3 | ✅ | ❌ | ❌ | ❌ | ✅ | > *This is a subset of the models supported. For the full list please check the [TensorRT-LLM support matrix](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)* @@ -127,7 +127,8 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *5.A selective set of the popular models are internally tested. The actual model support list may be longer. 
NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later* \ > *6.Some models currently support export to HF format only.* \ > *7.[PTQ for DeepSeek](../deepseek/README.md)* \ -> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* +> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \ +> *9.Running Whisper model requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* diff --git a/examples/llm_ptq/requirements-t5.txt b/examples/llm_ptq/requirements-t5.txt deleted file mode 100644 index 0347135464..0000000000 --- a/examples/llm_ptq/requirements-t5.txt +++ /dev/null @@ -1 +0,0 @@ -transformers==4.48.0 diff --git a/examples/llm_ptq/requirements-whisper.txt b/examples/llm_ptq/requirements-whisper.txt deleted file mode 100644 index a79b19aeee..0000000000 --- a/examples/llm_ptq/requirements-whisper.txt +++ /dev/null @@ -1,2 +0,0 @@ -librosa -soundfile diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py index 02e70741c5..ad7a8cf019 100644 --- a/modelopt/torch/trace/plugins/transformers.py +++ b/modelopt/torch/trace/plugins/transformers.py @@ -16,10 +16,10 @@ """Utilities to describe symbols in the dynamic attention module.""" import torch -from packaging.version import Version as _Version +import transformers +from packaging.version import Version from torch import nn -from transformers import __version__ as _transformers_version -from transformers.models.bert.modeling_bert import BertAttention +from transformers.models.bert.modeling_bert import BertAttention, BertLayer from transformers.models.gptj.modeling_gptj import GPTJAttention from ..symbols import Symbol, SymInfo, SymMap @@ -66,8 +66,7 @@ def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo: # BertAttention is a registered leaf (the proxy is not iterable). Patch BertLayer.forward to use # indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward # with chunk_size=0, which is the default for BERT). -if _Version(_transformers_version) >= _Version("5.0"): - from transformers.models.bert.modeling_bert import BertLayer as _BertLayer +if Version(transformers.__version__) >= Version("5.0"): def _fx_friendly_bert_layer_forward( self, @@ -113,4 +112,4 @@ def _fx_friendly_bert_layer_forward( # chunk_size_feed_forward=0, which is the BERT default). 
return self.feed_forward_chunk(attention_output) - _BertLayer.forward = _fx_friendly_bert_layer_forward + BertLayer.forward = _fx_friendly_bert_layer_forward diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index f5d0b39c1d..e0575654ef 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -15,7 +15,7 @@ import pytest -from _test_utils.examples.llm_ptq_utils import PTQCommand, WithRequirements +from _test_utils.examples.llm_ptq_utils import PTQCommand from _test_utils.examples.models import ( BART_PATH, MIXTRAL_PATH, @@ -36,18 +36,9 @@ def test_ptq_bart(command): command.run(BART_PATH) -class TestT5(WithRequirements): - requirements = [("transformers", "4.48.0")] - - @pytest.mark.parametrize( - "command", - [ - PTQCommand(quant="fp8", min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_t5(self, command): - command.run(T5_PATH) +@pytest.mark.parametrize("command", [PTQCommand(quant="fp8", min_sm=89)], ids=PTQCommand.param_str) +def test_ptq_t5(command): + command.run(T5_PATH) @pytest.mark.parametrize( @@ -61,22 +52,17 @@ def test_ptq_mixtral(command): command.run(MIXTRAL_PATH) -class TestWhisper(WithRequirements): - requirements = [ - ("librosa", None), - ("soundfile", None), - ] - - @pytest.mark.parametrize( - "command", - [ - # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_whisper(self, command): - command.run(WHISPER_PATH) +@pytest.mark.skip(reason="Whisper requires torchcodec and other system packages") +@pytest.mark.parametrize( + "command", + [ + # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size + PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), + ], + ids=PTQCommand.param_str, +) +def test_ptq_whisper(command): + command.run(WHISPER_PATH) @pytest.mark.parametrize( diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 8f9ad8bd9c..d9b1f9e438 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import os import warnings from contextlib import nullcontext @@ -28,6 +27,7 @@ get_tiny_qwen3_moe, tf_modelopt_state_and_output_tester, ) +from packaging.version import Version import modelopt.torch.quantization as mtq from modelopt.torch.quantization.nn import QuantLinear, QuantModuleRegistry @@ -105,6 +105,10 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) +@pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0"), + reason="test_dbrx is not supported for transformers<5.0", +) def test_dbrx(): assert DbrxExperts in QuantModuleRegistry assert DbrxExpertGLU in QuantModuleRegistry From 2dc3140ba4ec21863d045ebe8ca4c98e0b70af4f Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 03:06:13 -0700 Subject: [PATCH 13/28] Set min transformers 5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- CHANGELOG.rst | 2 +- examples/llm_autodeploy/run_auto_quantize.py | 2 +- examples/llm_qad/data_utils/download_dataset.py | 2 +- .../collect_hidden_states/compute_hidden_states_hf.py | 2 +- examples/speculative_decoding/requirements.txt | 1 - .../speculative_decoding/scripts/send_conversation_vllm.py | 2 +- modelopt/torch/__init__.py | 6 +++--- pyproject.toml | 2 +- tox.ini | 2 +- 9 files changed, 10 insertions(+), 11 deletions(-) delete mode 100644 examples/speculative_decoding/requirements.txt diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1ecd3976b0..b533904474 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,7 +18,7 @@ Changelog **Misc** -- Add ``transformers>=5.0`` support. +- Bump minimum recommended transformers version to 5.0. 0.43 (2026-04-09) ^^^^^^^^^^^^^^^^^ diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index 931c1153d3..389d8207b0 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -206,7 +206,7 @@ def modelopt_ptq( ) parser.add_argument( "--trust_remote_code", - help="Set trust_remotecode for Huggingface models and tokenizers", + help="Set trust_remote_code for Huggingface models and tokenizers", default=False, action="store_true", ) diff --git a/examples/llm_qad/data_utils/download_dataset.py b/examples/llm_qad/data_utils/download_dataset.py index 31992d9c1f..42ef6280e1 100644 --- a/examples/llm_qad/data_utils/download_dataset.py +++ b/examples/llm_qad/data_utils/download_dataset.py @@ -161,7 +161,7 @@ def main(): ) p.add_argument( "--trust_remote_code", - help="Set trust_remotecode for Huggingface models and tokenizers", + help="Set trust_remote_code for Huggingface models and tokenizers", default=False, action="store_true", ) diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py index fbe3b27683..449b261c56 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py @@ -87,7 +87,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--trust_remote_code", - help="Set trust_remotecode for Huggingface models and tokenizers", + help="Set trust_remote_code for Huggingface models and tokenizers", default=False, action="store_true", ) diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt deleted file mode 100644 index 
8e50f9c3f4..0000000000 --- a/examples/speculative_decoding/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers>=5.0 diff --git a/examples/speculative_decoding/scripts/send_conversation_vllm.py b/examples/speculative_decoding/scripts/send_conversation_vllm.py index ab55cd6863..d1a5ac5c11 100644 --- a/examples/speculative_decoding/scripts/send_conversation_vllm.py +++ b/examples/speculative_decoding/scripts/send_conversation_vllm.py @@ -57,7 +57,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--trust_remote_code", - help="Set trust_remotecode for Huggingface models and tokenizers", + help="Set trust_remote_code for Huggingface models and tokenizers", default=False, action="store_true", ) diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index d2d4d80582..1d5dde403f 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -32,10 +32,10 @@ try: from transformers import __version__ as _transformers_version - if not (_Version("4.56") <= _Version(_transformers_version)): + if not (_Version(_transformers_version) >= _Version("5.0")): _warnings.warn( - f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. " - "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.", + f"transformers {_transformers_version} is not tested with current version of modelopt and may cause issues." + " Please install recommended version with `pip install -U nvidia-modelopt[hf]` if working with HF models.", ) except ImportError: pass diff --git a/pyproject.toml b/pyproject.toml index cb4185e16b..23b49c70ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=4.56", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=5.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ diff --git a/tox.ini b/tox.ini index 80299d814d..f70e38ea9d 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ deps = -e .[all,dev-test] # Should match pyproject.toml - tf_min: transformers~=4.56.0 + tf_min: transformers~=5.0 commands = python -m pytest tests/unit {env:COV_ARGS:} From b37545b7cb86925a5c9b731ba72d0601f5243a7d Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 03:41:39 -0700 Subject: [PATCH 14/28] Fix more tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/speculative_decoding/scripts/ar_validate.py | 2 +- .../scripts/export_hf_checkpoint.py | 2 +- modelopt/onnx/llm_export_utils/export_utils.py | 3 ++- modelopt/torch/opt/plugins/transformers.py | 8 +++++--- .../torch/quantization/plugins/transformers_trainer.py | 10 +++++++++- tox.ini | 2 +- 6 files changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/speculative_decoding/scripts/ar_validate.py b/examples/speculative_decoding/scripts/ar_validate.py index 2890b2a5d5..1ad7bec409 100644 --- a/examples/speculative_decoding/scripts/ar_validate.py +++ b/examples/speculative_decoding/scripts/ar_validate.py @@ -55,7 +55,7 @@ def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=80, device=No def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True, help="Path to model directory") - parser.add_argument("--trust_remote_code", 
type=bool, default=False, help="Trust remote code") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument("--steps", type=int, default=3, help="Steps for AR validation") parser.add_argument( "--osl", type=int, default=32, help="Output sequence length for AR validation" diff --git a/examples/speculative_decoding/scripts/export_hf_checkpoint.py b/examples/speculative_decoding/scripts/export_hf_checkpoint.py index 7a8823099a..98ea438f1b 100644 --- a/examples/speculative_decoding/scripts/export_hf_checkpoint.py +++ b/examples/speculative_decoding/scripts/export_hf_checkpoint.py @@ -29,7 +29,7 @@ def parse_args(): description="Export a HF checkpoint (with ModelOpt state) for deployment." ) parser.add_argument("--model_path", type=str, default="Path of the trained checkpoint.") - parser.add_argument("--trust_remote_code", type=bool, default=False, help="Trust remote code") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument( "--export_path", type=str, default="Destination directory for exported files." ) diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py index f45b473059..4f50628dca 100644 --- a/modelopt/onnx/llm_export_utils/export_utils.py +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -86,7 +86,8 @@ def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True) hidden_states = outputs[0] - past_key_values = outputs.past_key_values.to_legacy_cache() + cache = outputs.past_key_values + past_key_values = tuple(zip(cache.key_cache, cache.value_cache)) logits = self.lm_head(hidden_states) return logits, past_key_values diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index a60ea1af3b..9cc729723e 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -25,7 +25,7 @@ from modelopt.torch.utils import report_memory -from ..conversion import ModeloptStateManager +from ..conversion import ModeloptStateManager, load_modelopt_state from .huggingface import ( _get_modelopt_state_path, _new_save_pretrained, @@ -78,9 +78,11 @@ def _restore_qtensor_wrappers(model, model_path): from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear from modelopt.torch.quantization.qtensor import QTensorWrapper - state = torch.load(modelopt_state_path, map_location="cpu", weights_only=False) - for _mode_name, mode_config in state.get("modelopt_state_dict", []): + state = load_modelopt_state(modelopt_state_path) + for _, mode_config in state["modelopt_state_dict"]: q_tensor_state = mode_config.get("metadata", {}).get("q_tensor_state", {}) + if not q_tensor_state: + continue for name, module in model.named_modules(): if ( isinstance(module, RealQuantLinear) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index b0d2786509..f54b9a9512 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -29,7 +29,7 @@ import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer from modelopt.torch.opt.plugins import ModelOptHFTrainer -from modelopt.torch.utils import print_rank_0 +from modelopt.torch.utils import 
get_module_device, print_rank_0 from ..config import QuantizeConfig from ..nn import TensorQuantizer @@ -191,8 +191,16 @@ def __init__( if getattr(self.args, "lora_config", None) is not None and not hasattr( self.model, "peft_config" ): + # NOTE: Adapter weights are created on CPU; move only the new parameters to the + # model's device. A full self.model.to(device) is unsafe for FSDP-wrapped + # models because it would try to consolidate sharded parameters. + existing_params = {id(p) for p in self.model.parameters()} # TODO: use get_peft_model here instead of add_adapter self.model.add_adapter(self.args.lora_config) + device = get_module_device(self.model) + for p in self.model.parameters(): + if id(p) not in existing_params and p.device != device: + p.data = p.data.to(device) print_rank_0("Lora adapter added.") if hasattr(self.model, "peft_config") and self.quant_cfg is not None: diff --git a/tox.ini b/tox.ini index f70e38ea9d..011490d121 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ deps = -e .[all,dev-test] # Should match pyproject.toml - tf_min: transformers~=5.0 + tf_min: transformers~=5.0.0 commands = python -m pytest tests/unit {env:COV_ARGS:} From 1024528e77d4e5685a64a9b8ce20735ce7182963 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 05:20:21 -0700 Subject: [PATCH 15/28] Fix for TRT-LLM Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/__init__.py | 97 +++++++++++++++++++ modelopt/torch/export/model_config_export.py | 3 +- modelopt/torch/export/tensorrt_llm_utils.py | 3 +- .../torch/quantization/backends/nvfp4_gemm.py | 2 +- .../plugins/transformers_trainer.py | 32 +++--- tests/_test_utils/examples/models.py | 4 +- tests/examples/vlm_ptq/test_qwen_vl.py | 2 + 7 files changed, 121 insertions(+), 22 deletions(-) diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index 1d5dde403f..783cc37081 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -15,10 +15,13 @@ """Model optimization and deployment subpackage for torch.""" +import sys as _sys import warnings as _warnings from packaging.version import Version as _Version from torch import __version__ as _torch_version +from torch import device as _device +from torch import dtype as _dtype from . import distill, nas, opt, peft, prune, quantization, sparsity, speculative, utils @@ -27,8 +30,87 @@ "nvidia-modelopt will drop torch<2.7 support in a future release.", DeprecationWarning ) + # Since `hf` dependencies are optional and users have pre-installed transformers, we need to ensure # correct version is installed to avoid incompatibility issues. 
+def _patch_transformers_compat(mod) -> None: + """Compatibility shims for names removed in transformers 5.0.""" + import torch.nn as _nn + + # AutoModelForVision2Seq -> AutoModelForImageTextToText + if not hasattr(mod, "AutoModelForVision2Seq") and hasattr(mod, "AutoModelForImageTextToText"): + mod.AutoModelForVision2Seq = mod.AutoModelForImageTextToText + + # get_parameter_device and get_parameter_dtype were removed in transformers 5.0 + modeling_utils = _sys.modules.get("transformers.modeling_utils") + if modeling_utils is not None: + if not hasattr(modeling_utils, "get_parameter_device"): + + def get_parameter_device(parameter: _nn.Module) -> _device: + return next(parameter.parameters()).device + + modeling_utils.get_parameter_device = get_parameter_device # type: ignore[attr-defined] + + if not hasattr(modeling_utils, "get_parameter_dtype"): + + def get_parameter_dtype(parameter: _nn.Module) -> _dtype: + return next(parameter.parameters()).dtype + + modeling_utils.get_parameter_dtype = get_parameter_dtype # type: ignore[attr-defined] + + if not hasattr(modeling_utils, "load_sharded_checkpoint"): + try: + from transformers.trainer_utils import ( + load_sharded_checkpoint as _load_sharded_checkpoint, + ) + + modeling_utils.load_sharded_checkpoint = _load_sharded_checkpoint # type: ignore[attr-defined] + except ImportError: + pass + + # PreTrainedTokenizerBase.vocab_size raises NotImplementedError in transformers 5.0 + # for tokenizers that don't implement it. Python 3's hasattr() only catches + # AttributeError, so callers using hasattr(tok, 'vocab_size') get a crash instead + # of False. Re-raise as AttributeError so hasattr works correctly. + try: + from transformers.tokenization_utils_base import ( + PreTrainedTokenizerBase as _PreTrainedTokenizerBase, + ) + + _orig_vocab_size_fget = _PreTrainedTokenizerBase.vocab_size.fget + + def _safe_vocab_size(self): + try: + return _orig_vocab_size_fget(self) + except NotImplementedError: + # Processors (e.g. Qwen2VLProcessor) inherit vocab_size from + # PreTrainedTokenizerBase but don't implement it — delegate to + # the inner tokenizer if present. + inner = getattr(self, "tokenizer", None) + if inner is not None and inner is not self: + return inner.vocab_size + raise AttributeError("vocab_size not implemented for this tokenizer") + + _PreTrainedTokenizerBase.vocab_size = property(_safe_vocab_size) + except Exception: + pass + + # AutoConfig.register raises ValueError when a model type is already built into + # transformers (e.g. exaone_moe added in 5.0). Older packages like TRT-LLM call + # register without exist_ok=True. Patch CONFIG_MAPPING.register to silently skip. + try: + from transformers.models.auto.configuration_auto import CONFIG_MAPPING as _CONFIG_MAPPING + + _orig_cfg_register = _CONFIG_MAPPING.register + + def _patched_cfg_register(key, value, exist_ok=False): + _orig_cfg_register(key, value, exist_ok=True) + + _CONFIG_MAPPING.register = _patched_cfg_register + except Exception: + pass + + try: from transformers import __version__ as _transformers_version @@ -37,6 +119,21 @@ f"transformers {_transformers_version} is not tested with current version of modelopt and may cause issues." 
" Please install recommended version with `pip install -U nvidia-modelopt[hf]` if working with HF models.", ) + + # Temporary workaround until https://github.com/NVIDIA/TensorRT-LLM/pull/12541 makes it into a TRT-LLM container + if "transformers" in _sys.modules: + _patch_transformers_compat(_sys.modules["transformers"]) + else: + + class _TransformersCompatFinder: + def find_module(self, fullname, path=None): + if fullname == "transformers": + _sys.meta_path.remove(self) # type: ignore[arg-type] + import importlib as _importlib + + _patch_transformers_compat(_importlib.import_module(fullname)) + + _sys.meta_path.insert(0, _TransformersCompatFinder()) # type: ignore[arg-type] except ImportError: pass diff --git a/modelopt/torch/export/model_config_export.py b/modelopt/torch/export/model_config_export.py index b9acb80c8b..ae92e2776f 100644 --- a/modelopt/torch/export/model_config_export.py +++ b/modelopt/torch/export/model_config_export.py @@ -151,7 +151,8 @@ def torch_to_tensorrt_llm_checkpoint( model_metadata_config = model.config.__dict__ vocab_size = model.config.vocab_size hf_config = model.config - architecture = model.config.architectures[0] + architectures = getattr(model.config, "architectures", None) + architecture = architectures[0] if architectures else "" # For Baichuan 13B, we check if alibi is used with the alibi_mask property. if hasattr(model, "model") and hasattr(model.model, "alibi_mask"): diff --git a/modelopt/torch/export/tensorrt_llm_utils.py b/modelopt/torch/export/tensorrt_llm_utils.py index 75708dbcde..f49fcd4899 100755 --- a/modelopt/torch/export/tensorrt_llm_utils.py +++ b/modelopt/torch/export/tensorrt_llm_utils.py @@ -48,6 +48,7 @@ "gemma": "GemmaForCausalLM", "gemma3": "Gemma3ForCausalLM", "gpt": "GPTForCausalLM", + "qwen": "QWenForCausalLM", "enc": "EncoderModel", "dec": "DecoderModel", "mllama": "MLLaMAModel", @@ -240,7 +241,7 @@ def convert_to_tensorrt_llm_config( layernorm_type_map = {i.name: i.value for i in LayerNormType} layernorm_position_map = {i.name: i.value for i in LayerNormPositionType} - if decoder_type in ["gpt", "gemma", "llama"]: + if decoder_type in ["gpt", "gemma", "llama", "qwen"]: pass elif decoder_type == "mpt": config.update( diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index ffc18fea33..e7d2b90ff7 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -176,7 +176,7 @@ def backward(ctx, grad_outputs): grad_weight = grad_outputs.reshape(-1, grad_outputs.shape[-1]).T @ input_tensor.reshape( -1, input_tensor.shape[-1] ) - if ctx.compute_bias_grad is not None: + if ctx.compute_bias_grad: # Sum all dimensions except the last one grad_bias = grad_outputs.sum(dim=list(range(grad_outputs.dim() - 1))) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index f54b9a9512..1b85e2ef09 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -29,7 +29,7 @@ import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer from modelopt.torch.opt.plugins import ModelOptHFTrainer -from modelopt.torch.utils import get_module_device, print_rank_0 +from modelopt.torch.utils import print_rank_0 from ..config import QuantizeConfig from ..nn import TensorQuantizer @@ -175,6 +175,20 @@ def __init__( **kwargs, ): 
"""Initialize the trainer with modelopt states.""" + # Add LoRA adapter BEFORE super().__init__() which wraps the model with + # FSDP/accelerate. Adding adapters after FSDP wrapping leaves the new + # LoRA weights on CPU while the rest of the model is on GPU. + training_args = kwargs.get("args") or (args[1] if len(args) > 1 else None) + model = kwargs.get("model") or (args[0] if args else None) + if ( + model is not None + and training_args is not None + and getattr(training_args, "lora_config", None) is not None + and not hasattr(model, "peft_config") + ): + model.add_adapter(training_args.lora_config) + print_rank_0("Lora adapter added.") + super().__init__(*args, **kwargs) self.quant_args = quant_args @@ -187,22 +201,6 @@ def __init__( ) self.quant_cfg = quant_cfg - # Add lora adapter before quantizing the model - if getattr(self.args, "lora_config", None) is not None and not hasattr( - self.model, "peft_config" - ): - # NOTE: Adapter weights are created on CPU; move only the new parameters to the - # model's device. A full self.model.to(device) is unsafe for FSDP-wrapped - # models because it would try to consolidate sharded parameters. - existing_params = {id(p) for p in self.model.parameters()} - # TODO: use get_peft_model here instead of add_adapter - self.model.add_adapter(self.args.lora_config) - device = get_module_device(self.model) - for p in self.model.parameters(): - if id(p) not in existing_params and p.device != device: - p.data = p.data.to(device) - print_rank_0("Lora adapter added.") - if hasattr(self.model, "peft_config") and self.quant_cfg is not None: target_modules = ( self.args.lora_config.target_modules if hasattr(self.args, "lora_config") else [] diff --git a/tests/_test_utils/examples/models.py b/tests/_test_utils/examples/models.py index abedd7b2a4..8bf2b95a60 100644 --- a/tests/_test_utils/examples/models.py +++ b/tests/_test_utils/examples/models.py @@ -64,8 +64,8 @@ def _select_path(remote_id: str, local_id: str) -> str: ) QWEN_VL_PATH = _select_path( - remote_id="Qwen/Qwen2-VL-2B-Instruct", - local_id="Qwen2-VL-2B-Instruct", + remote_id="Qwen/Qwen3-VL-2B-Instruct", + local_id="Qwen3-VL-2B-Instruct", ) # Diffusers diff --git a/tests/examples/vlm_ptq/test_qwen_vl.py b/tests/examples/vlm_ptq/test_qwen_vl.py index 458d7563db..11c5c833be 100644 --- a/tests/examples/vlm_ptq/test_qwen_vl.py +++ b/tests/examples/vlm_ptq/test_qwen_vl.py @@ -21,4 +21,6 @@ @pytest.mark.parametrize("quant", ["fp8", "int8_sq", "nvfp4"]) def test_qwen_vl(quant): + if quant == "fp8": + pytest.skip(reason="FP8 is not supported for Qwen2-VL") run_vlm_ptq_command(model=QWEN_VL_PATH, quant=quant) From 1e45639ff2e615a41d1ec8677e489d4cb269c0b1 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 25 Mar 2026 08:53:21 -0700 Subject: [PATCH 16/28] Let PTQ example tests run with transformers<5.0 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 8 ++--- .github/workflows/gpu_tests.yml | 6 ++-- CHANGELOG.rst | 2 +- examples/llm_ptq/README.md | 2 +- modelopt/torch/__init__.py | 36 ++++--------------- .../plugins/transformers_trainer.py | 29 +++++++-------- pyproject.toml | 2 +- tests/examples/llm_ptq/test_llm_ptq.py | 9 +++-- tests/examples/vlm_ptq/test_qwen_vl.py | 2 -- tox.ini | 2 +- 10 files changed, 37 insertions(+), 61 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index f3f3908043..1c2a8fbb47 100644 --- 
a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -70,7 +70,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" @@ -82,7 +82,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" @@ -99,7 +99,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-1 @@ -113,7 +113,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 538e05e75f..214afa756d 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -64,13 +64,13 @@ jobs: include: - example: gpu timeout: 45 - container_image: pytorch:26.01-py3 + container_image: pytorch:26.03-py3 - example: gpu-megatron timeout: 45 - container_image: pytorch:26.01-py3 + container_image: pytorch:26.03-py3 - example: gpu-trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc5 + container_image: tensorrt-llm/release:1.3.0rc9 runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b533904474..7f9917e10d 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,7 +18,7 @@ Changelog **Misc** -- Bump minimum recommended transformers version to 5.0. +- Add experimental support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for some models with transformers>=5.0 yet. 0.43 (2026-04-09) ^^^^^^^^^^^^^^^^^ diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index fc06bc95d3..0bba1d71e1 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -128,7 +128,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *6.Some models currently support export to HF format only.* \ > *7.[PTQ for DeepSeek](../deepseek/README.md)* \ > *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \ -> *9.Running Whisper model requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* +> *9.Running Whisper model with transformers>=5.0 requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. 
ffmpeg).* > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index 783cc37081..456bf7b4a3 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -68,33 +68,6 @@ def get_parameter_dtype(parameter: _nn.Module) -> _dtype: except ImportError: pass - # PreTrainedTokenizerBase.vocab_size raises NotImplementedError in transformers 5.0 - # for tokenizers that don't implement it. Python 3's hasattr() only catches - # AttributeError, so callers using hasattr(tok, 'vocab_size') get a crash instead - # of False. Re-raise as AttributeError so hasattr works correctly. - try: - from transformers.tokenization_utils_base import ( - PreTrainedTokenizerBase as _PreTrainedTokenizerBase, - ) - - _orig_vocab_size_fget = _PreTrainedTokenizerBase.vocab_size.fget - - def _safe_vocab_size(self): - try: - return _orig_vocab_size_fget(self) - except NotImplementedError: - # Processors (e.g. Qwen2VLProcessor) inherit vocab_size from - # PreTrainedTokenizerBase but don't implement it — delegate to - # the inner tokenizer if present. - inner = getattr(self, "tokenizer", None) - if inner is not None and inner is not self: - return inner.vocab_size - raise AttributeError("vocab_size not implemented for this tokenizer") - - _PreTrainedTokenizerBase.vocab_size = property(_safe_vocab_size) - except Exception: - pass - # AutoConfig.register raises ValueError when a model type is already built into # transformers (e.g. exaone_moe added in 5.0). Older packages like TRT-LLM call # register without exist_ok=True. Patch CONFIG_MAPPING.register to silently skip. @@ -114,13 +87,18 @@ def _patched_cfg_register(key, value, exist_ok=False): try: from transformers import __version__ as _transformers_version - if not (_Version(_transformers_version) >= _Version("5.0")): + if _Version(_transformers_version) < _Version("4.56"): _warnings.warn( f"transformers {_transformers_version} is not tested with current version of modelopt and may cause issues." " Please install recommended version with `pip install -U nvidia-modelopt[hf]` if working with HF models.", ) + elif _Version(_transformers_version) >= _Version("5.0"): + _warnings.warn( + "transformers>=5.0 support is experimental. 
Unified Hugging Face checkpoint export for quantized " + "checkpoints may not work for some models yet.", + ) - # Temporary workaround until https://github.com/NVIDIA/TensorRT-LLM/pull/12541 makes it into a TRT-LLM container + # Temporary workaround until TRT-LLM container supports transformers 5.0 if "transformers" in _sys.modules: _patch_transformers_compat(_sys.modules["transformers"]) else: diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 1b85e2ef09..3ffd4274cd 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -29,7 +29,7 @@ import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer from modelopt.torch.opt.plugins import ModelOptHFTrainer -from modelopt.torch.utils import print_rank_0 +from modelopt.torch.utils import get_module_device, print_rank_0 from ..config import QuantizeConfig from ..nn import TensorQuantizer @@ -175,20 +175,6 @@ def __init__( **kwargs, ): """Initialize the trainer with modelopt states.""" - # Add LoRA adapter BEFORE super().__init__() which wraps the model with - # FSDP/accelerate. Adding adapters after FSDP wrapping leaves the new - # LoRA weights on CPU while the rest of the model is on GPU. - training_args = kwargs.get("args") or (args[1] if len(args) > 1 else None) - model = kwargs.get("model") or (args[0] if args else None) - if ( - model is not None - and training_args is not None - and getattr(training_args, "lora_config", None) is not None - and not hasattr(model, "peft_config") - ): - model.add_adapter(training_args.lora_config) - print_rank_0("Lora adapter added.") - super().__init__(*args, **kwargs) self.quant_args = quant_args @@ -201,6 +187,14 @@ def __init__( ) self.quant_cfg = quant_cfg + # Add lora adapter before quantizing the model + if getattr(self.args, "lora_config", None) is not None and not hasattr( + self.model, "peft_config" + ): + # TODO: use get_peft_model here instead of add_adapter + self.model.add_adapter(self.args.lora_config) + print_rank_0("Lora adapter added.") + if hasattr(self.model, "peft_config") and self.quant_cfg is not None: target_modules = ( self.args.lora_config.target_modules if hasattr(self.args, "lora_config") else [] @@ -350,8 +344,11 @@ def _load_best_model(self, *args, **kwargs): ), "Some base_layer parameters are not frozen" adapter_name = self.model.active_adapters()[0] + device = get_module_device(self.model) self.model.delete_adapter(adapter_name) - self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + self.model.load_adapter( + self.state.best_model_checkpoint, adapter_name, torch_device=device + ) else: super()._load_best_model(*args, **kwargs) diff --git a/pyproject.toml b/pyproject.toml index 23b49c70ec..52ee43ccb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=5.0", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=4.56.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index e0575654ef..358516e993 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -12,9 +12,8 @@ # WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - import pytest +import transformers from _test_utils.examples.llm_ptq_utils import PTQCommand from _test_utils.examples.models import ( BART_PATH, @@ -23,6 +22,7 @@ TINY_LLAMA_PATH, WHISPER_PATH, ) +from packaging.version import Version @pytest.mark.parametrize( @@ -52,7 +52,10 @@ def test_ptq_mixtral(command): command.run(MIXTRAL_PATH) -@pytest.mark.skip(reason="Whisper requires torchcodec and other system packages") +@pytest.mark.skipif( + Version(transformers.__version__) >= Version("5.0"), + reason="Whisper requires torchcodec and other system packages for transformers>=5.0", +) @pytest.mark.parametrize( "command", [ diff --git a/tests/examples/vlm_ptq/test_qwen_vl.py b/tests/examples/vlm_ptq/test_qwen_vl.py index 11c5c833be..458d7563db 100644 --- a/tests/examples/vlm_ptq/test_qwen_vl.py +++ b/tests/examples/vlm_ptq/test_qwen_vl.py @@ -21,6 +21,4 @@ @pytest.mark.parametrize("quant", ["fp8", "int8_sq", "nvfp4"]) def test_qwen_vl(quant): - if quant == "fp8": - pytest.skip(reason="FP8 is not supported for Qwen2-VL") run_vlm_ptq_command(model=QWEN_VL_PATH, quant=quant) diff --git a/tox.ini b/tox.ini index 011490d121..80299d814d 100644 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ deps = -e .[all,dev-test] # Should match pyproject.toml - tf_min: transformers~=5.0.0 + tf_min: transformers~=4.56.0 commands = python -m pytest tests/unit {env:COV_ARGS:} From 38e26e3e478b6a2ef94c65ed4bb4c1a43a5de0b8 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Thu, 26 Mar 2026 02:43:19 -0700 Subject: [PATCH 17/28] fix tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- examples/gpt-oss/configs/sft_full.yaml | 6 +++--- examples/gpt-oss/configs/sft_lora.yaml | 6 +++--- examples/gpt-oss/qat-finetune-transformers.ipynb | 2 +- examples/llm_ptq/requirements.txt | 1 + .../llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb | 4 ++-- examples/vlm_ptq/requirements.txt | 1 + .../quantization/plugins/transformers_trainer.py | 5 ++--- .../torch/speculative/eagle/default_config.py | 1 + .../torch/speculative/plugins/transformers.py | 16 ++++++++++++++-- 9 files changed, 28 insertions(+), 14 deletions(-) create mode 100644 examples/vlm_ptq/requirements.txt diff --git a/examples/gpt-oss/configs/sft_full.yaml b/examples/gpt-oss/configs/sft_full.yaml index 7d980b9d03..c3ba873be2 100644 --- a/examples/gpt-oss/configs/sft_full.yaml +++ b/examples/gpt-oss/configs/sft_full.yaml @@ -16,7 +16,7 @@ per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 2 max_length: 4096 -warmup_steps: 0.03 +warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 @@ -30,6 +30,6 @@ eval_steps: 8 dataset_test_split: test # ModelOpt Quantization Parameters -quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG - # For the full list of supported configs, do: mtq.config.choices +quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG + # For the full list of supported configs, do: mtq.config.choices calib_size: 128 diff --git a/examples/gpt-oss/configs/sft_lora.yaml b/examples/gpt-oss/configs/sft_lora.yaml index 4b44ca4af9..4f35c36182 100644 --- a/examples/gpt-oss/configs/sft_lora.yaml +++ 
b/examples/gpt-oss/configs/sft_lora.yaml @@ -21,7 +21,7 @@ lora_alpha: 16 lora_dropout: 0.0 lora_target_modules: all-linear max_length: 4096 -warmup_steps: 0.03 +warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 @@ -35,6 +35,6 @@ eval_steps: 8 dataset_test_split: test # ModelOpt Quantization Parameters -quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG - # For the full list of supported configs, do: mtq.config.choices +quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG + # For the full list of supported configs, do: mtq.config.choices calib_size: 128 diff --git a/examples/gpt-oss/qat-finetune-transformers.ipynb b/examples/gpt-oss/qat-finetune-transformers.ipynb index 58dba84cb6..42226b2982 100644 --- a/examples/gpt-oss/qat-finetune-transformers.ipynb +++ b/examples/gpt-oss/qat-finetune-transformers.ipynb @@ -207,7 +207,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_steps=0.03,\n", + " warmup_steps=0.03, # use warmup_ratio if using transformers<5.0\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=10,\n", diff --git a/examples/llm_ptq/requirements.txt b/examples/llm_ptq/requirements.txt index 1469d5552b..460be2fe61 100644 --- a/examples/llm_ptq/requirements.txt +++ b/examples/llm_ptq/requirements.txt @@ -3,5 +3,6 @@ fire flash-attn>=2.6.0 rouge_score>=0.1.2 tiktoken +transformers<5.0 transformers_stream_generator zstandard diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index 9c10c55c25..f52d596f7c 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -275,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0bf60614-99a0-48b0-85a8-1d88cd7c72ba", "metadata": {}, "outputs": [], @@ -290,7 +290,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_steps=0.03,\n", + " warmup_steps=0.03, # use warmup_ratio if using transformers<5.0\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=50,\n", diff --git a/examples/vlm_ptq/requirements.txt b/examples/vlm_ptq/requirements.txt new file mode 100644 index 0000000000..180f534118 --- /dev/null +++ b/examples/vlm_ptq/requirements.txt @@ -0,0 +1 @@ +transformers<5.0 diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 3ffd4274cd..2536327843 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -346,9 +346,8 @@ def _load_best_model(self, *args, **kwargs): adapter_name = self.model.active_adapters()[0] device = get_module_device(self.model) self.model.delete_adapter(adapter_name) - self.model.load_adapter( - self.state.best_model_checkpoint, adapter_name, torch_device=device - ) + self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + self.model.to(device) else: super()._load_best_model(*args, **kwargs) diff --git a/modelopt/torch/speculative/eagle/default_config.py b/modelopt/torch/speculative/eagle/default_config.py index f8c4924c19..224823ad17 100644 --- a/modelopt/torch/speculative/eagle/default_config.py +++ 
b/modelopt/torch/speculative/eagle/default_config.py @@ -25,6 +25,7 @@ "high_freq_factor": 4.0, "original_max_position_embeddings": 8192, "rope_type": "llama3", + "rope_theta": 500000.0, }, "rope_theta": 500000.0, "num_hidden_layers": 1, diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index 286f256b87..7ff9654123 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -560,9 +560,21 @@ def modify( elif self.eagle_decoder_type == "kimik2": decoder_cls = _setup_kimi_k2_decoder() - self.eagle_config = PretrainedConfig.from_dict(config.eagle_architecture_config) + arch_config = config.eagle_architecture_config + + # Populate base-model-dependent fields before constructing PretrainedConfig, + # since transformers >=5.4 validates rope_scaling during __init__. + arch_config.setdefault("hidden_size", self._base_llm_config.hidden_size) + arch_config.setdefault("vocab_size", self._base_llm_config.vocab_size) + arch_config.setdefault( + "max_position_embeddings", self._base_llm_config.max_position_embeddings + ) + rope_scaling = arch_config.get("rope_scaling") + if rope_scaling and "rope_theta" not in rope_scaling and "rope_theta" in arch_config: + rope_scaling["rope_theta"] = arch_config["rope_theta"] + + self.eagle_config = PretrainedConfig.from_dict(arch_config) self.eagle_config.eagle_decoder_type = self.eagle_decoder_type - # Hidden size and vocab size must match base model self.eagle_config.hidden_size = self._base_llm_config.hidden_size self.eagle_config.vocab_size = self._base_llm_config.vocab_size self.eagle_config.max_position_embeddings = self._base_llm_config.max_position_embeddings From 26cf04a5c97214ab7e43cc48a1699045bdeee423 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:09:15 -0700 Subject: [PATCH 18/28] minor fixes Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/gpu_tests.yml | 5 +++- .github/workflows/unit_tests.yml | 12 ++++++---- CHANGELOG.rst | 1 + .../_installation_for_Linux.rst | 2 +- examples/gpt-oss/sft.py | 2 +- .../onnx/llm_export_utils/export_utils.py | 24 +++++++++++++++++-- modelopt/torch/__init__.py | 4 ++-- modelopt/torch/opt/plugins/huggingface.py | 13 +++++++++- pyproject.toml | 2 +- tests/_test_utils/torch/vision_models.py | 8 +++---- .../onnx/quantization/test_quantize_api.py | 6 ----- tox.ini | 7 +++--- 12 files changed, 59 insertions(+), 27 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 214afa756d..e013c1e17c 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -76,7 +76,7 @@ jobs: container: &gpu_container image: nvcr.io/nvidia/${{ matrix.container_image }} env: - GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py + GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: &gpu_steps @@ -85,6 +85,9 @@ jobs: - name: Setup environment variables run: | echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV + - name: Upgrade transformers for gpu-trtllm test since TRT-LLM container is using transformers 4.57 + if: matrix.example == 'gpu-trtllm' + run: pip install -U transformers - name: Run gpu tests run: pip install tox-current-env && tox -e cuda13-${{ 
matrix.example }} --current-env gpu-tests-non-pr: diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f2e862df32..605f930f2b 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -38,7 +38,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit + run: pip install tox && COV_ARGS="--cov" tox -e py312-torch211-tf_latest-unit - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: @@ -64,6 +64,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: py: [10, 11, 13] steps: @@ -72,15 +73,16 @@ jobs: with: python-version: "3.${{ matrix.py }}" - name: Run unit tests - run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit + run: pip install tox && tox -e py3${{ matrix.py }}-torch211-tf_latest-unit multi-torch: if: github.event_name == 'pull_request' needs: [linux] runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: - torch: [26, 27, 28, 29] + torch: [28, 29, 210] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup @@ -92,13 +94,14 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: tf: [min] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit + run: pip install tox && tox -e py312-torch211-tf_${{ matrix.tf }}-unit launcher: if: github.event_name == 'pull_request' needs: [linux] @@ -122,6 +125,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: test-env: [onnx, torch] steps: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7f9917e10d..3a0a4aab8d 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,6 +18,7 @@ Changelog **Misc** +- Bump minimum required PyTorch version to 2.8. - Add experimental support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for some models with transformers>=5.0 yet. 
0.43 (2026-04-09) diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 8071c34af3..2b2d4d8219 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -16,7 +16,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | CUDA | 12.x, 13.x | +-------------------------+-----------------------------+ -| PyTorch | >=2.6 | +| PyTorch | >=2.8 | +-------------------------+-----------------------------+ | TensorRT-LLM (Optional) | >=1.0 | +-------------------------+-----------------------------+ diff --git a/examples/gpt-oss/sft.py b/examples/gpt-oss/sft.py index 4d30fc0fd7..6cdad5187c 100644 --- a/examples/gpt-oss/sft.py +++ b/examples/gpt-oss/sft.py @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args): "revision": model_args.model_revision, "trust_remote_code": model_args.trust_remote_code, "attn_implementation": model_args.attn_implementation, - "dtype": getattr(model_args, "dtype", "float32"), + "dtype": getattr(model_args, "dtype", "bfloat16"), "use_cache": not training_args.gradient_checkpointing, } diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py index 4f50628dca..a6d2b607ac 100644 --- a/modelopt/onnx/llm_export_utils/export_utils.py +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -76,7 +76,7 @@ def __init__(self, model): self.lm_head = model.lm_head self.config = model.config - def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): + def forward(self, input_ids: torch.Tensor, past_key_values: tuple): """Forward pass.""" # Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3) cache = DynamicCache(config=self.config) @@ -84,7 +84,27 @@ def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): cache.value_cache = [kv[1] for kv in past_key_values] past_key_values = cache - outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True) + # Pre-compute a 4D causal mask so that transformers' internal mask creation + # (which relies on Python-int shapes) is bypassed entirely. During ONNX/JIT tracing, + # tensor.shape[N] can return a 0-dim scalar tensor instead of a Python int, which breaks + # the masking code in transformers>=5.4 + seq_len = input_ids.shape[1] + past_len = past_key_values.get_seq_length() # type: ignore[attr-defined] + causal_mask = ( + torch.tril( + torch.ones(seq_len, past_len + seq_len, dtype=torch.bool, device=input_ids.device), + diagonal=past_len, + ) + .unsqueeze(0) + .unsqueeze(0) + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=causal_mask, + past_key_values=past_key_values, + use_cache=True, + ) hidden_states = outputs[0] cache = outputs.past_key_values past_key_values = tuple(zip(cache.key_cache, cache.value_cache)) diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index 456bf7b4a3..d8f02fa580 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -25,9 +25,9 @@ from . 
import distill, nas, opt, peft, prune, quantization, sparsity, speculative, utils -if _Version(_torch_version) < _Version("2.7"): +if _Version(_torch_version) < _Version("2.9"): _warnings.warn( - "nvidia-modelopt will drop torch<2.7 support in a future release.", DeprecationWarning + "nvidia-modelopt will drop torch<2.9 support in a future release.", DeprecationWarning ) diff --git a/modelopt/torch/opt/plugins/huggingface.py b/modelopt/torch/opt/plugins/huggingface.py index 8b6396f3e7..db077487c0 100644 --- a/modelopt/torch/opt/plugins/huggingface.py +++ b/modelopt/torch/opt/plugins/huggingface.py @@ -23,6 +23,8 @@ from typing import Any import torch +from huggingface_hub import try_to_load_from_cache +from huggingface_hub.errors import HFValidationError from modelopt.torch.utils import print_rank_0 @@ -57,7 +59,16 @@ def register_for_patching(name: str, cls: type, patch_methods: list[tuple[str, A def _get_modelopt_state_path(model_name_or_path: str) -> str: - return os.path.join(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) + """Get the path to the ModelOpt state file, or an empty string if not found. + + Also handles an HF Hub model ID as the input path. However, for HF Hub models, we don't have a modelopt_state at the moment. + """ + if os.path.isdir(model_name_or_path): + return os.path.join(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) + try: + return try_to_load_from_cache(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) or "" + except HFValidationError: + return "" @contextmanager diff --git a/pyproject.toml b/pyproject.toml index 52ee43ccb2..aa6194df18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "nvidia-ml-py>=12", "packaging", "setuptools>=80", # torch.utils.cpp_extension imports setuptools at load time - "torch>=2.6", + "torch>=2.8", "tqdm", # modelopt.torch "PyYAML>=6.0", diff --git a/tests/_test_utils/torch/vision_models.py b/tests/_test_utils/torch/vision_models.py index 40e99c8d01..639dc16695 100644 --- a/tests/_test_utils/torch/vision_models.py +++ b/tests/_test_utils/torch/vision_models.py @@ -132,10 +132,10 @@ def get_model_and_input(on_gpu: bool = False): ], _create_torchvision_segmentation_fn, ), - "unet": ( - ["unet_carvana"], - _create_unet_fn, - ), + # "unet": ( + # ["unet_carvana"], + # _create_unet_fn, + # ), } diff --git a/tests/unit/onnx/quantization/test_quantize_api.py b/tests/unit/onnx/quantization/test_quantize_api.py index 3ce8f2f7fe..464fb1a88b 100644 --- a/tests/unit/onnx/quantization/test_quantize_api.py +++ b/tests/unit/onnx/quantization/test_quantize_api.py @@ -36,7 +36,6 @@ # onnxruntime version that supports opset 22+ ORT_VERSION_FOR_OPSET_22 = version.parse("1.23.0") -TORCH_VERSION_FOR_OPSET_22 = version.parse("2.8.0") # Test scenarios: (scenario_name, export_opset_offset, request_opset_offset, expected_opset_offset) @@ -87,11 +86,6 @@ def test_quantize_opset_handling( pytest.skip( f"Opset {max_opset} requires onnxruntime >= {ORT_VERSION_FOR_OPSET_22}, have {ort_version}" ) - torch_version = version.parse(torch.__version__) - if torch_version < TORCH_VERSION_FOR_OPSET_22: - pytest.skip( - f"Opset {max_opset} requires torch >= {TORCH_VERSION_FOR_OPSET_22}, have {torch_version}" - ) # Setup: create and export model model_torch = SimpleMLP() diff --git a/tox.ini b/tox.ini index 80299d814d..8b7022d074 100644 --- a/tox.ini +++ b/tox.ini @@ -12,14 +12,13 @@ passenv = ############################ # CPU Unit test environments ############################ -[testenv:{py310,py311,py312,py313}-torch{26,27,28,29,210}-tf_{min,latest}-unit]
+[testenv:{py310,py311,py312,py313}-torch{28,29,210,211}-tf_{min,latest}-unit] deps = # torch version auto-selected based on torchvision version - torch26: torchvision~=0.21.0 - torch27: torchvision~=0.22.0 torch28: torchvision~=0.23.0 torch29: torchvision~=0.24.0 torch210: torchvision~=0.25.0 + torch211: torchvision~=0.26.0 -e .[all,dev-test] @@ -37,7 +36,7 @@ allowlist_externals = bash, rm deps = # Make sure torch 2.10 is used - torchvision~=0.25.0 + torchvision~=0.26.0 # ONNX unit tests heavily rely on torch / torchvision onnx: .[onnx,dev-test] From 6d3af7cd071452ac45f839ccff94bfc532a471e3 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 31 Mar 2026 03:57:14 -0700 Subject: [PATCH 19/28] Remove transformers 5.0 compatibility patch for trtllm; disable MOE cpu tests for torch 2.8 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 4 +- .github/workflows/gpu_tests.yml | 9 +-- examples/gpt-oss/requirements.txt | 1 - .../kl_divergence_metrics/requirements.txt | 5 +- .../perplexity_metrics/requirements.txt | 3 +- .../accuracy_benchmark/requirements.txt | 2 - .../diffusers/qad_example/requirements.txt | 2 - modelopt/onnx/quantization/extensions.py | 7 +- modelopt/torch/__init__.py | 71 ------------------- .../quantization/plugins/test_huggingface.py | 3 + .../torch/quantization/plugins/test_peft.py | 4 ++ .../quantization/plugins/test_sparse_moe.py | 4 ++ 12 files changed, 21 insertions(+), 94 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index 1c2a8fbb47..fb9718a734 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -99,7 +99,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-1 @@ -113,7 +113,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index e013c1e17c..d0f8891179 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -64,13 +64,13 @@ jobs: include: - example: gpu timeout: 45 - container_image: pytorch:26.03-py3 + container_image: pytorch:26.02-py3 - example: gpu-megatron timeout: 45 - container_image: pytorch:26.03-py3 + container_image: pytorch:26.02-py3 - example: gpu-trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc9 + container_image: tensorrt-llm/release:1.3.0rc10 runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container @@ -85,9 +85,6 @@ jobs: - name: Setup environment variables run: | echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV - - name: Upgrade transformers for gpu-trtllm test since TRT-LLM container is using transformers 4.57 - if: matrix.example == 'gpu-trtllm' - run: pip install -U transformers - name: Run gpu tests run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} 
--current-env gpu-tests-non-pr: diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 76c3b0a2e8..d18f9eb539 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,4 +1,3 @@ kernels>=0.9.0 -torch>2.7.1 trackio trl>=0.21.0 diff --git a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt index 8409b2f8ea..e5ac0ab52b 100644 --- a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt @@ -1,8 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu129 accelerate datasets -numpy safetensors>=0.4.0 - -torch>=2.0.0 -transformers>=4.30.0 +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt index 4bdac071cf..46f24a4537 100644 --- a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt @@ -2,10 +2,9 @@ --extra-index-url https://download.pytorch.org/whl/cu129 accelerate datasets -numpy onnxruntime-genai pandas sentencepiece>=0.2.1 tokenizers>=0.14.1 torch>=2.6.0 -transformers>=4.53 +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/requirements.txt b/examples/windows/accuracy_benchmark/requirements.txt index ad4c91cacd..dd02bcaf68 100644 --- a/examples/windows/accuracy_benchmark/requirements.txt +++ b/examples/windows/accuracy_benchmark/requirements.txt @@ -1,11 +1,9 @@ datasets>=2.14.5 fire==0.6.0 -fire>=0.5.0 numpy==1.26.4 openai>=0.28.1 pandas==2.2.2 peft>=0.5.0 rwkv>=0.7.3 tiktoken==0.7.0 -tqdm==4.66.5 transformers==4.57.3 diff --git a/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt b/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt index f6aa9bfda7..0aafd11840 100644 --- a/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt +++ b/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt @@ -6,7 +6,5 @@ ltx-trainer @ git+https://github.com/Lightricks/LTX-2.git#subdirectory=packages/ # NVIDIA ModelOpt (quantization & distillation) nvidia-modelopt -pyyaml safetensors -torch>=2.0 diff --git a/modelopt/onnx/quantization/extensions.py b/modelopt/onnx/quantization/extensions.py index 13956eeac3..4603992ee6 100644 --- a/modelopt/onnx/quantization/extensions.py +++ b/modelopt/onnx/quantization/extensions.py @@ -17,19 +17,18 @@ import os import sys +from warnings import warn import cppimport -from modelopt.onnx.logging_config import logger - try: - logger.info("Loading extension modelopt_round_and_pack_ext...") + print("Loading extension modelopt_round_and_pack_ext...") path = os.path.join(os.path.dirname(__file__), "src") sys.path.append(path) round_and_pack_ext = cppimport.imp("modelopt_round_and_pack_ext") sys.path.remove(path) except Exception as e: - logger.warning( + warn( f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to python based optimized version" ) round_and_pack_ext = None diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index d8f02fa580..190e94529c 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -15,13 +15,10 @@ """Model optimization and deployment subpackage for torch.""" -import sys as _sys import warnings as _warnings from packaging.version import Version 
as _Version from torch import __version__ as _torch_version -from torch import device as _device -from torch import dtype as _dtype from . import distill, nas, opt, peft, prune, quantization, sparsity, speculative, utils @@ -31,59 +28,6 @@ ) -# Since `hf` dependencies are optional and users have pre-installed transformers, we need to ensure -# correct version is installed to avoid incompatibility issues. -def _patch_transformers_compat(mod) -> None: - """Compatibility shims for names removed in transformers 5.0.""" - import torch.nn as _nn - - # AutoModelForVision2Seq -> AutoModelForImageTextToText - if not hasattr(mod, "AutoModelForVision2Seq") and hasattr(mod, "AutoModelForImageTextToText"): - mod.AutoModelForVision2Seq = mod.AutoModelForImageTextToText - - # get_parameter_device and get_parameter_dtype were removed in transformers 5.0 - modeling_utils = _sys.modules.get("transformers.modeling_utils") - if modeling_utils is not None: - if not hasattr(modeling_utils, "get_parameter_device"): - - def get_parameter_device(parameter: _nn.Module) -> _device: - return next(parameter.parameters()).device - - modeling_utils.get_parameter_device = get_parameter_device # type: ignore[attr-defined] - - if not hasattr(modeling_utils, "get_parameter_dtype"): - - def get_parameter_dtype(parameter: _nn.Module) -> _dtype: - return next(parameter.parameters()).dtype - - modeling_utils.get_parameter_dtype = get_parameter_dtype # type: ignore[attr-defined] - - if not hasattr(modeling_utils, "load_sharded_checkpoint"): - try: - from transformers.trainer_utils import ( - load_sharded_checkpoint as _load_sharded_checkpoint, - ) - - modeling_utils.load_sharded_checkpoint = _load_sharded_checkpoint # type: ignore[attr-defined] - except ImportError: - pass - - # AutoConfig.register raises ValueError when a model type is already built into - # transformers (e.g. exaone_moe added in 5.0). Older packages like TRT-LLM call - # register without exist_ok=True. Patch CONFIG_MAPPING.register to silently skip. - try: - from transformers.models.auto.configuration_auto import CONFIG_MAPPING as _CONFIG_MAPPING - - _orig_cfg_register = _CONFIG_MAPPING.register - - def _patched_cfg_register(key, value, exist_ok=False): - _orig_cfg_register(key, value, exist_ok=True) - - _CONFIG_MAPPING.register = _patched_cfg_register - except Exception: - pass - - try: from transformers import __version__ as _transformers_version @@ -97,21 +41,6 @@ def _patched_cfg_register(key, value, exist_ok=False): "transformers>=5.0 support is experimental. 
Unified Hugging Face checkpoint export for quantized " "checkpoints may not work for some models yet.", ) - - # Temporary workaround until TRT-LLM container supports transformers 5.0 - if "transformers" in _sys.modules: - _patch_transformers_compat(_sys.modules["transformers"]) - else: - - class _TransformersCompatFinder: - def find_module(self, fullname, path=None): - if fullname == "transformers": - _sys.meta_path.remove(self) # type: ignore[arg-type] - import importlib as _importlib - - _patch_transformers_compat(_importlib.import_module(fullname)) - - _sys.meta_path.insert(0, _TransformersCompatFinder()) # type: ignore[arg-type] except ImportError: pass diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index d9b1f9e438..8c66e6651c 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -155,6 +155,9 @@ def test_dbrx(): @pytest.mark.parametrize("method", ["gradient", "kl_div"]) @pytest.mark.parametrize("model_provider", [get_tiny_llama, get_tiny_qwen3_moe]) def test_autoquantize_huggingface(model_provider, method): + if model_provider == get_tiny_qwen3_moe and Version(torch.__version__) < Version("2.9"): + pytest.skip("torch 2.8 grouped_mm is CUDA-only") + model = model_provider() input_ids = model.dummy_inputs["input_ids"] diff --git a/tests/unit/torch/quantization/plugins/test_peft.py b/tests/unit/torch/quantization/plugins/test_peft.py index c794c67bc2..310a744c70 100644 --- a/tests/unit/torch/quantization/plugins/test_peft.py +++ b/tests/unit/torch/quantization/plugins/test_peft.py @@ -16,6 +16,7 @@ import pytest import torch from _test_utils.torch.transformers_models import get_tiny_gpt_oss, get_tiny_llama, tf_output_tester +from packaging.version import Version pytest.importorskip("peft") transformers = pytest.importorskip("transformers") @@ -53,6 +54,9 @@ def test_convert_loralinear(): tf_output_tester(model_ref, model_test) +@pytest.mark.skipif( + Version(torch.__version__) < Version("2.9"), reason="torch 2.8 grouped_mm is CUDA-only" +) def test_peft_flow(tmp_path): model_original = get_tiny_gpt_oss(num_hidden_layers=1) diff --git a/tests/unit/torch/quantization/plugins/test_sparse_moe.py b/tests/unit/torch/quantization/plugins/test_sparse_moe.py index 4ef428e9bb..3e8baab798 100644 --- a/tests/unit/torch/quantization/plugins/test_sparse_moe.py +++ b/tests/unit/torch/quantization/plugins/test_sparse_moe.py @@ -20,9 +20,13 @@ import pytest import torch import torch.nn as nn +from packaging.version import Version pytest.importorskip("transformers") +if Version(torch.__version__) < Version("2.9"): + pytest.skip("torch 2.8 grouped_mm is CUDA-only", allow_module_level=True) + from _test_utils.torch.transformers_models import get_tiny_qwen3_moe import modelopt.torch.quantization as mtq From f707ce5f250d4884696ba75f866d24aa24773180 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:21:46 -0700 Subject: [PATCH 20/28] fix for cppimport container test Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/gpu_tests.yml | 5 +++-- modelopt/onnx/quantization/extensions.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index d0f8891179..d24e04e317 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml 
@@ -64,10 +64,11 @@ jobs: include: - example: gpu timeout: 45 - container_image: pytorch:26.02-py3 + container_image: pytorch:26.01-py3 + # tests/gpu/_extensions/test_onnx_extensions.py fails for newer containers until https://github.com/tbenthompson/cppimport/pull/98 - example: gpu-megatron timeout: 45 - container_image: pytorch:26.02-py3 + container_image: pytorch:26.01-py3 - example: gpu-trtllm timeout: 30 container_image: tensorrt-llm/release:1.3.0rc10 diff --git a/modelopt/onnx/quantization/extensions.py b/modelopt/onnx/quantization/extensions.py index 4603992ee6..9c9f367269 100644 --- a/modelopt/onnx/quantization/extensions.py +++ b/modelopt/onnx/quantization/extensions.py @@ -19,6 +19,7 @@ import sys from warnings import warn +# TODO: cppimport is no longer maintained, switch to a different library import cppimport try: @@ -31,4 +32,8 @@ warn( f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to python based optimized version" ) + print( + "If you see `copy_file() got an unexpected keyword argument 'dry_run'`, you will need " + "https://github.com/tbenthompson/cppimport/pull/98 or downgrade setuptools until we have a workaround" + ) round_and_pack_ext = None From d5b61cbe20fbcc236312b28c9c6d51dda12fcdfb Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:03:56 -0700 Subject: [PATCH 21/28] Fix spec dec example tests Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/_example_tests_runner.yml | 4 -- .github/workflows/example_tests.yml | 4 +- examples/llm_eval/requirements.txt | 1 - examples/llm_ptq/requirements.txt | 1 - .../torch/speculative/plugins/transformers.py | 15 ++++--- pyproject.toml | 1 + tests/_test_utils/examples/llm_ptq_utils.py | 25 +++-------- .../examples/speculative_decoding/conftest.py | 43 ++++++++++--------- .../speculative_decoding/test_eagle.py | 21 ++++----- ...unified_hf_export_and_check_safetensors.py | 1 + 10 files changed, 48 insertions(+), 68 deletions(-) diff --git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml index 5aa0614c71..992b4127db 100644 --- a/.github/workflows/_example_tests_runner.yml +++ b/.github/workflows/_example_tests_runner.yml @@ -47,10 +47,6 @@ jobs: echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV - name: Install dependencies run: | - # Install git-lfs for Daring-Anteater dataset - apt-get update && apt-get install -y git-lfs - git lfs install --system - # use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers python -m pip install ".${{ inputs.pip_install_extras }}" diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index fb9718a734..f8ef06c7db 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -72,7 +72,7 @@ jobs: with: docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 30 + timeout_minutes: 45 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-1 @@ -84,7 +84,7 @@ jobs: with: docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 30 + timeout_minutes: 45 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff --git a/examples/llm_eval/requirements.txt b/examples/llm_eval/requirements.txt index 
88faeac5ca..df47ac76c6 100644 --- a/examples/llm_eval/requirements.txt +++ b/examples/llm_eval/requirements.txt @@ -2,5 +2,4 @@ fire>=0.5.0 lm_eval[api,ifeval]==0.4.8 peft>=0.5.0 rwkv>=0.7.3 -tiktoken torchvision diff --git a/examples/llm_ptq/requirements.txt b/examples/llm_ptq/requirements.txt index 460be2fe61..51f4b48625 100644 --- a/examples/llm_ptq/requirements.txt +++ b/examples/llm_ptq/requirements.txt @@ -2,7 +2,6 @@ compressed-tensors==0.12.0 fire flash-attn>=2.6.0 rouge_score>=0.1.2 -tiktoken transformers<5.0 transformers_stream_generator zstandard diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index 7ff9654123..a7edb44c0f 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -282,9 +282,9 @@ def __init__(self, config, decoder_layer_cls, bias=False): num_layers=self.config.parallel_draft_heads_num_layers, ) - def _maybe_init_rope(self): + def _maybe_init_rope(self, device=None): if self.config.eagle_decoder_type == "llama" and not hasattr(self, "rotary_emb"): - self.rotary_emb = LlamaRotaryEmbedding(config=self.config) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config, device=device) def _expand_first_attn_in_dim(self, first_layer_attn): """Modify qkv projection in first layer to accept 2h hidden size.""" @@ -758,7 +758,10 @@ def _compute_ttt_attention_mask( ) -> BlockMask | torch.Tensor: """Return TTT attention_mask tensor of type BlockMask or Tensor depends on eagle attn impl.""" msk_func = get_ttt_msk_func(seq_length, ttt_step) - dtypemin = torch.finfo(self._base_llm_config.dtype).min + dtype = ( + self._base_llm_config.dtype or self.eagle_module.layers[0].input_layernorm.weight.dtype + ) + dtypemin = torch.finfo(dtype).min q_len = seq_length kv_len = seq_length * (1 + ttt_step) if self.eagle_config._attn_implementation == "flex_attention": @@ -774,7 +777,7 @@ def _compute_ttt_attention_mask( torch.arange(kv_len).view(1, 1, 1, kv_len), ).to(self.device) tensor_mask = torch.full_like( - tensor_mask, 0, dtype=self._base_llm_config.dtype, device=self.device + tensor_mask, 0, dtype=dtype, device=self.device ).masked_fill(~tensor_mask, dtypemin) return tensor_mask @@ -944,7 +947,7 @@ def forward( base_outputs, ) - self.eagle_module._maybe_init_rope() + self.eagle_module._maybe_init_rope(device=input_ids.device) # ====Run eagle forward with extra training-time-test steps==== for ttt_step in range(self.eagle_ttt_steps): @@ -1077,7 +1080,7 @@ def pseudo_speculative_generate( else: eagle_input_hidden_states = base_model_hidden_states - self.eagle_module._maybe_init_rope() + self.eagle_module._maybe_init_rope(device=eagle_input_hidden_states.device) draft_tokens = [] for step in range(steps): b, seq_length = eagle_ids.shape diff --git a/pyproject.toml b/pyproject.toml index aa6194df18..4aa7ad86e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export + "tiktoken", "transformers>=4.56.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] diff --git a/tests/_test_utils/examples/llm_ptq_utils.py b/tests/_test_utils/examples/llm_ptq_utils.py index 3bd7a39222..17a0764275 100644 --- a/tests/_test_utils/examples/llm_ptq_utils.py +++ b/tests/_test_utils/examples/llm_ptq_utils.py @@ -16,12 +16,10 @@ import importlib.metadata as metadata import subprocess from dataclasses 
import asdict, dataclass -from pathlib import Path import pytest import torch - -PTQ_EXAMPLE_DIR = Path(__file__).parents[3] / "examples" / "llm_ptq" +from _test_utils.examples.run_command import run_llm_ptq_command @dataclass @@ -32,6 +30,7 @@ class PTQCommand: sparsity: str | None = None kv_cache_quant: str | None = None trust_remote_code: bool = False + calib_dataset: str = "cnn_dailymail" calib_batch_size: int | None = None auto_quantize_bits: float | None = None tp: int | None = None @@ -47,37 +46,23 @@ def run(self, model_path: str): self.min_sm % 10, ): pytest.skip(reason=f"Requires sm{self.min_sm} or higher") - return if self.max_sm and torch.cuda.get_device_capability() > ( self.max_sm // 10, self.max_sm % 10, ): pytest.skip(reason=f"Requires sm{self.max_sm} or lower") - return if self.min_gpu and torch.cuda.device_count() < self.min_gpu: pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs") - return param_dict = asdict(self) - param_dict.pop("min_sm", None) + param_dict.pop("max_sm", None) param_dict.pop("min_gpu", None) - trust_remote_code = param_dict.pop("trust_remote_code", False) - - args = ["--model", model_path] - for key, value in param_dict.items(): - if value is not None: - args.append(f"--{key}") - args.append(f"{value}") - - if trust_remote_code: - args.append("--trust_remote_code") - - self.command = ["scripts/huggingface_example.sh", "--no-verbose", *args] - subprocess.run(self.command, cwd=PTQ_EXAMPLE_DIR, check=True) + quant = param_dict.pop("quant") + run_llm_ptq_command(model=model_path, quant=quant, **param_dict) def param_str(self): param_dict = asdict(self) diff --git a/tests/examples/speculative_decoding/conftest.py b/tests/examples/speculative_decoding/conftest.py index 80417f4048..34ab4e4741 100644 --- a/tests/examples/speculative_decoding/conftest.py +++ b/tests/examples/speculative_decoding/conftest.py @@ -13,30 +13,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import pytest -from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command +import yaml +from _test_utils.examples.run_command import run_example_command @pytest.fixture(scope="session", autouse=True) def tiny_daring_anteater_path(tmp_path_factory): - dataset_path = ( - MODELOPT_ROOT / "examples/speculative_decoding/input_conversations/daring-anteater.jsonl" + tmp_dir = tmp_path_factory.mktemp("daring_anteater") + output_file = tmp_dir / "train.jsonl" + + config = { + "outputs": [ + { + "filename": str(output_file), + "global_limit": 100, + "sources": [{"name": "daring-anteater", "splits": {"all": 100}}], + } + ] + } + config_path = tmp_dir / "data_config.yaml" + config_path.write_text(yaml.dump(config)) + + run_example_command( + ["python", "prepare_input_conversations/make_dataset.py", "-f", str(config_path), "--full"], + "speculative_decoding", ) - if not os.path.exists(dataset_path): - try: - run_example_command( - ["python", "prepare_input_conversations/add_daring_anteater.py"], - "speculative_decoding", - ) - except Exception as e: - # Ignore rate-limiting errors - pytest.skip(f"Failed to prepare dataset: {e}") - output_path = tmp_path_factory.mktemp("daring_anteater") / "train.jsonl" - with open(dataset_path) as src, open(output_path, "w") as dst: - for i, line in enumerate(src): - if i >= 128: - break - dst.write(line) - return output_path + + return output_file diff --git a/tests/examples/speculative_decoding/test_eagle.py b/tests/examples/speculative_decoding/test_eagle.py index 271241bcb0..3a813ed943 100644 --- a/tests/examples/speculative_decoding/test_eagle.py +++ b/tests/examples/speculative_decoding/test_eagle.py @@ -22,6 +22,7 @@ import torch from _test_utils.examples.run_command import run_example_command from packaging.version import Version +from transformers import AutoConfig from modelopt.torch.export.plugins.hf_spec_export import LLAMA_EAGLE_SINGLE_LAYER @@ -105,11 +106,11 @@ def test_llama_eagle3(tiny_llama_path, tiny_daring_anteater_path, tmp_path, eagle_output_dir, cp_size, - mix_hidden_states): + mix_hidden_states, + num_gpus): """Test Eagle3 training with a tiny llama model, using different cp_size values.""" - available_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - if cp_size == 2 and available_gpus < 2: - pytest.skip("cp_size=2 requires at least 2 GPUs, but only {} found.".format(available_gpus)) + if cp_size == 2 and num_gpus < 2: + pytest.skip("cp_size=2 requires at least 2 GPUs, but only {} found.".format(num_gpus)) if cp_size == 2 and not Version(torch.__version__) >= Version("2.10.0"): pytest.skip("cp_size=2 requires torch 2.10.0") # Create an ultra-tiny EAGLE config for testing to reduce memory usage @@ -220,16 +221,12 @@ def test_offline_eagle3_training( model_source, use_fake_base, ): """Test Eagle3 training with pre-computed hidden states (offline mode / FakeBaseModel).""" - import transformers - model_path = tiny_llama_path if model_source is None else model_source model_id = "tinyllama" if model_source is None else model_source.split("/")[-1] output_subdir = eagle_output_dir / f"eagle-{model_id}-offline" - cfg = transformers.AutoConfig.from_pretrained(model_path, trust_remote_code=True) - - if model_source=="moonshotai/Kimi-K2.5": - #vlm, get text config + cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if hasattr(cfg, "text_config"): # vlm: get text_config cfg = cfg.text_config offline_data_dir = generate_offline_pt_data( @@ -277,10 +274,8 @@ def 
test_offline_resume_training_kimi(tiny_daring_anteater_path, tmp_path, eagle Depends on test_offline_eagle3_training["kimi-k2.5"] having run first. Exercises AutoModelForCausalLM.from_pretrained with model_type='fake_base_model'. """ - import transformers - checkpoint_dir = eagle_output_dir / "eagle-Kimi-K2.5-offline" - config = transformers.AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True) + config = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True) offline_data_dir = generate_offline_pt_data( tmp_path / "offline_data_resume", diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 6b971c3251..45c2f1f40e 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -80,6 +80,7 @@ def test_unified_hf_export_and_check_safetensors( pyt_ckpt_path=tiny_model_dir, qformat=qformat, export_path=output_dir, + calib_dataset="cnn_dailymail", ) # Run the command From 22e9d4dbe1bb0239526530ceeeed3e11559f1171 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:47:31 -0700 Subject: [PATCH 22/28] minor Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/__init__.py | 1 - modelopt/torch/speculative/plugins/transformers.py | 3 --- tests/examples/llm_ptq/test_llm_ptq.py | 2 +- .../export/test_unified_hf_export_and_check_safetensors.py | 2 +- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/modelopt/__init__.py b/modelopt/__init__.py index c64e30b14a..1490782795 100644 --- a/modelopt/__init__.py +++ b/modelopt/__init__.py @@ -15,7 +15,6 @@ """Nvidia Model Optimizer (modelopt).""" -import warnings as _warnings from importlib.metadata import version as _version __version__ = _version("nvidia-modelopt") diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index a7edb44c0f..b787a89063 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -575,9 +575,6 @@ def modify( self.eagle_config = PretrainedConfig.from_dict(arch_config) self.eagle_config.eagle_decoder_type = self.eagle_decoder_type - self.eagle_config.hidden_size = self._base_llm_config.hidden_size - self.eagle_config.vocab_size = self._base_llm_config.vocab_size - self.eagle_config.max_position_embeddings = self._base_llm_config.max_position_embeddings self.eagle_config.draft_vocab_size = getattr( self.eagle_config, "draft_vocab_size", self.eagle_config.vocab_size ) diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index 358516e993..a5a470eea6 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -60,7 +60,7 @@ def test_ptq_mixtral(command): "command", [ # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), + PTQCommand(quant="fp8", calib_batch_size=16, calib_dataset="peoples_speech", min_sm=89), ], ids=PTQCommand.param_str, ) diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 45c2f1f40e..6a27ece72f 100644 --- 
a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -80,7 +80,7 @@ def test_unified_hf_export_and_check_safetensors( pyt_ckpt_path=tiny_model_dir, qformat=qformat, export_path=output_dir, - calib_dataset="cnn_dailymail", + dataset="cnn_dailymail", ) # Run the command From fdeb1ab960380b8811d0a3ee198e2728842a2435 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 1 Apr 2026 02:28:32 -0700 Subject: [PATCH 23/28] Add back windows accuracy_benchmark dependencies + trust_remote_code fix Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .../kl_divergence_metrics/requirements.txt | 2 ++ examples/windows/accuracy_benchmark/mmlu_benchmark.py | 6 +++++- .../accuracy_benchmark/perplexity_metrics/requirements.txt | 1 + examples/windows/accuracy_benchmark/requirements.txt | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt index e5ac0ab52b..7108970c7c 100644 --- a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt @@ -1,5 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/cu129 accelerate datasets +numpy safetensors>=0.4.0 +torch>=2.6.0 transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/mmlu_benchmark.py b/examples/windows/accuracy_benchmark/mmlu_benchmark.py index 4eb2fd6190..54573e6425 100644 --- a/examples/windows/accuracy_benchmark/mmlu_benchmark.py +++ b/examples/windows/accuracy_benchmark/mmlu_benchmark.py @@ -501,7 +501,11 @@ def evaluate_func(args, subject, dev_df, test_df): tokenizer = get_tokenizer(model_ckpt_path, trust_remote_code=trust_remote_code) model = select_model( - max_input_length=MAX_SEQ_LEN, max_output_length=2, dtype=dtype, **kwargs + max_input_length=MAX_SEQ_LEN, + max_output_length=2, + dtype=dtype, + trust_remote_code=trust_remote_code, + **kwargs, ) assert isinstance(model, EvalModel) if quant_cfg: diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt index 46f24a4537..c9eadf1b09 100644 --- a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt @@ -2,6 +2,7 @@ --extra-index-url https://download.pytorch.org/whl/cu129 accelerate datasets +numpy onnxruntime-genai pandas sentencepiece>=0.2.1 diff --git a/examples/windows/accuracy_benchmark/requirements.txt b/examples/windows/accuracy_benchmark/requirements.txt index dd02bcaf68..cb3f95140e 100644 --- a/examples/windows/accuracy_benchmark/requirements.txt +++ b/examples/windows/accuracy_benchmark/requirements.txt @@ -6,4 +6,5 @@ pandas==2.2.2 peft>=0.5.0 rwkv>=0.7.3 tiktoken==0.7.0 +tqdm==4.66.5 transformers==4.57.3 From 6061218083db334b5dd5dba75f46182a15fac92e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:47:37 -0700 Subject: [PATCH 24/28] revert onnx extension file back to logger Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/onnx/quantization/extensions.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git 
a/modelopt/onnx/quantization/extensions.py b/modelopt/onnx/quantization/extensions.py
index 9c9f367269..68facdaac8 100644
--- a/modelopt/onnx/quantization/extensions.py
+++ b/modelopt/onnx/quantization/extensions.py
@@ -17,22 +17,21 @@
 import os
 import sys
-from warnings import warn
 # TODO: cppimport is no longer maintained, switch to a different library
 import cppimport
+from modelopt.onnx.logging_config import logger
+
 try:
-    print("Loading extension modelopt_round_and_pack_ext...")
+    logger.info("Loading extension modelopt_round_and_pack_ext...")
     path = os.path.join(os.path.dirname(__file__), "src")
     sys.path.append(path)
     round_and_pack_ext = cppimport.imp("modelopt_round_and_pack_ext")
     sys.path.remove(path)
 except Exception as e:
-    warn(
-        f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to python based optimized version"
-    )
-    print(
+    logger.warning(
+        f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to python based optimized version. "
        "If you see `copy_file() got an unexpected keyword argument 'dry_run'`, you will need "
        "https://github.com/tbenthompson/cppimport/pull/98 or downgrade setuptools until we have a workaround"
    )

From c74d5eccfe4c2271d1daa53cae4f35df70c1f017 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:11:42 -0700
Subject: [PATCH 25/28] Pin transformers<5.4 in spec dec example

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 examples/speculative_decoding/requirements.txt    |  1 +
 tests/examples/speculative_decoding/test_eagle.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)
 create mode 100644 examples/speculative_decoding/requirements.txt

diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt
new file mode 100644
index 0000000000..409c35f0ed
--- /dev/null
+++ b/examples/speculative_decoding/requirements.txt
@@ -0,0 +1 @@
+transformers<5.4
diff --git a/tests/examples/speculative_decoding/test_eagle.py b/tests/examples/speculative_decoding/test_eagle.py
index 3a813ed943..ca094bc6e2 100644
--- a/tests/examples/speculative_decoding/test_eagle.py
+++ b/tests/examples/speculative_decoding/test_eagle.py
@@ -211,8 +211,14 @@ def test_convert_to_vllm_ckpt(tiny_llama_path, eagle_output_dir):
     [
         (None, False),  # tiny_llama (from fixture), no FakeBase
         ("moonshotai/Kimi-K2.5", True),  # remote HF repo, FakeBaseModel
-        ("moonshotai/Kimi-K2-Thinking", True),  # remote HF repo, no FakeBaseModel
-        ("MiniMaxAI/MiniMax-M2.5", True),
+        pytest.param(
+            "moonshotai/Kimi-K2-Thinking", True,  # remote HF repo, no FakeBaseModel
+            marks=pytest.mark.manual(reason="skip redundant test, too slow"),
+        ),
+        pytest.param(
+            "MiniMaxAI/MiniMax-M2.5", True,
+            marks=pytest.mark.manual(reason="skip redundant test, too slow"),
+        ),
     ],
     ids=["tinyllama", "kimi-k2.5","kimi-k2-thinking","minimax-m2.5"],
 )

From c3f1e8727e171a7c9f7d3b95c15747d984c02476 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 7 Apr 2026 03:35:01 -0700
Subject: [PATCH 26/28] Fix pyproject.toml version

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/example_tests.yml | 4 ++--
 pyproject.toml                      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
index f8ef06c7db..fb9718a734 100644
--- a/.github/workflows/example_tests.yml
+++ 
b/.github/workflows/example_tests.yml @@ -72,7 +72,7 @@ jobs: with: docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 45 + timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-1 @@ -84,7 +84,7 @@ jobs: with: docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 45 + timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff --git a/pyproject.toml b/pyproject.toml index 43c079db2d..ba1eb3ea80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ hf = [ "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "tiktoken", - "transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=4.56", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ From 1ede79499a04697c387028ebee2e7cd0852aee01 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Tue, 7 Apr 2026 03:56:18 -0700 Subject: [PATCH 27/28] Fix HFEagleModel for transformers 5.5 Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/speculative/plugins/transformers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index a2e90dd458..ce7791cea4 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -38,7 +38,7 @@ from torch import nn from torch.nn import CrossEntropyLoss from torch.nn.attention.flex_attention import BlockMask, create_block_mask -from transformers import Cache, DynamicCache, PretrainedConfig, PreTrainedModel +from transformers import Cache, DynamicCache, PreTrainedModel from transformers.models.llama.modeling_llama import ( LlamaDecoderLayer, LlamaRMSNorm, @@ -571,7 +571,10 @@ def modify( if rope_scaling and "rope_theta" not in rope_scaling and "rope_theta" in arch_config: rope_scaling["rope_theta"] = arch_config["rope_theta"] - self.eagle_config = PretrainedConfig.from_dict(arch_config) + # Use the base model's config class so fields like max_position_embeddings are declared + # before transformers>=5.5 rope standardization runs in __post_init__. 
+        base_config_cls = type(self._base_llm_config)
+        self.eagle_config = base_config_cls.from_dict(arch_config)
         self.eagle_config.eagle_decoder_type = self.eagle_decoder_type
         self.eagle_config.draft_vocab_size = getattr(
             self.eagle_config, "draft_vocab_size", self.eagle_config.vocab_size

From 54dd1769a1a780f522a14187a2be3f845ea1ffb9 Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Tue, 7 Apr 2026 03:56:32 -0700
Subject: [PATCH 28/28] Sparse Sequential MoE fixes

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .../torch/quantization/plugins/huggingface.py | 31 ++++++++------
 .../torch/quantization/utils/core_utils.py    | 10 +----
 ...e_moe.py => test_sparse_sequential_moe.py} | 42 ++++++++++---------
 3 files changed, 41 insertions(+), 42 deletions(-)
 rename tests/unit/torch/quantization/plugins/{test_sparse_moe.py => test_sparse_sequential_moe.py} (89%)

diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index b40623aa20..6fda3d7bdd 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -452,9 +452,12 @@ def backward(ctx, grad_output):
 _transposed_quantize = _TransposedQuantization.apply

-class _QuantSparseMoe(QuantModule):
+class _QuantSparseSequentialMoe(QuantModule):
     """Quantization wrapper for HuggingFace sparse MoE blocks.

+    This base class is for Sequential MoEs (i.e. each expert is implemented as a standalone module).
+    Transformers>=5.0 has batched experts and no per-expert quantizers.
+
     Supports ``layer_sync_moe_local_experts_amax`` to sync input quantizer amax across experts.

     Optionally supports config-driven features (disabled by default):
@@ -572,10 +575,6 @@ def layer_sync_moe_local_experts_amax(self):
         """
         if self._moe_calib_experts_ratio is not None:
             return
-        try:
-            iter(self.experts)
-        except TypeError:
-            return
         sync_moe_expert_amax(self.experts)

@@ -884,7 +883,7 @@ def forward(
         return final_hidden_states

-class _QuantDbrxFFN(_QuantSparseMoe):
+class _QuantDbrxFFN(_QuantSparseSequentialMoe):
     @property
     def num_experts(self):
         return self.router.moe_num_experts
@@ -1325,8 +1324,8 @@ def _has_num_experts(obj):
     return hasattr(obj, "num_experts") or hasattr(obj, "n_routed_experts")

-def _is_sparse_moe_block(module):
-    """Check if a module is structurally a sparse MoE block compatible with _QuantSparseMoe.
+def _is_sparse_sequaential_moe_block(module):
+    """Check if a module is structurally a sparse sequential MoE block compatible with _QuantSparseSequentialMoe.

     All HuggingFace MoE blocks (Mixtral, Qwen3Moe, Qwen2Moe, Qwen3Next, Llama4, MiniMax,
     NemotronH, etc.) share a common structural pattern: a ``gate`` (TopKRouter) sub-module with
@@ -1339,6 +1338,10 @@ def _is_sparse_moe_block(module):
     if not hasattr(module, "experts"):
         return False

+    if not hasattr(module.experts, "__iter__"):
+        # transformers>=5.0 has batched experts, no per-expert quantizers
+        return False
+
     # Primary: gate sub-module has topk/top_k + num_experts (standard TopKRouter pattern)
     if hasattr(module, "gate"):
         gate = module.gate
@@ -1355,10 +1358,10 @@ def _is_sparse_moe_block(module):

 def register_sparse_moe_on_the_fly(model):
-    """Auto-detect and register MOE modules as _QuantSparseMoe.
+    """Auto-detect and register MOE modules as _QuantSparseSequentialMoe.
Walks the model tree, identifies MoE blocks by their structural attributes - (``gate`` + ``experts``), and registers unregistered ones with ``_QuantSparseMoe``. + (``gate`` + ``experts``), and registers unregistered ones with ``_QuantSparseSequentialMoe``. """ visited_types = set() for name, module in model.named_modules(): @@ -1371,12 +1374,14 @@ def register_sparse_moe_on_the_fly(model): visited_types.add(mod_type) - if _is_sparse_moe_block(module): + if _is_sparse_sequaential_moe_block(module): print( f"\033[1mDetected MOE module '{name}' of type {mod_type.__name__}, " - f"registering with _QuantSparseMoe.\033[0m" + f"registering with _QuantSparseSequentialMoe.\033[0m" + ) + QuantModuleRegistry.register({mod_type: f"hf.{mod_type.__name__}"})( + _QuantSparseSequentialMoe ) - QuantModuleRegistry.register({mod_type: f"hf.{mod_type.__name__}"})(_QuantSparseMoe) def _is_supported_hf_model(model): diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index b83dae4480..538654b7de 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -529,14 +529,8 @@ def sync_moe_expert_amax(experts): 2. For any ``weight_quantizer`` that is enabled but has ``amax is None`` (expert received no tokens during calibration), runs a weight-only ``max_calibrate`` to populate the missing amax. - - No-op for batched expert modules (e.g. transformers>=5.0 ``Qwen3MoeExperts``) - that store all expert weights in a single 3D tensor without per-expert sub-modules. """ - if not hasattr(experts, "__iter__"): - # transformers>=5.0: batched experts, no per-expert quantizers - return - + from ..model_calib import max_calibrate from ..nn import TensorQuantizer amax_dict: dict[str, torch.Tensor] = {} @@ -558,8 +552,6 @@ def sync_moe_expert_amax(experts): if isinstance(module, TensorQuantizer) and name in amax_dict: module.amax = amax_dict[name].detach().clone() - from ..model_calib import max_calibrate - for expert in experts: for name, module in expert.named_modules(): if name.endswith("weight_quantizer") and module.is_enabled and module.amax is None: diff --git a/tests/unit/torch/quantization/plugins/test_sparse_moe.py b/tests/unit/torch/quantization/plugins/test_sparse_sequential_moe.py similarity index 89% rename from tests/unit/torch/quantization/plugins/test_sparse_moe.py rename to tests/unit/torch/quantization/plugins/test_sparse_sequential_moe.py index 3e8baab798..636a43bad5 100644 --- a/tests/unit/torch/quantization/plugins/test_sparse_moe.py +++ b/tests/unit/torch/quantization/plugins/test_sparse_sequential_moe.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Tests for _is_sparse_moe_block and _QuantSparseMoe.""" +"""Tests for _is_sparse_sequaential_moe_block and _QuantSparseSequentialMoe.""" import copy @@ -33,13 +33,13 @@ from modelopt.torch.quantization.nn import QuantModuleRegistry from modelopt.torch.quantization.plugins.huggingface import ( TRANSFORMERS_VERSION_GE_5_0, - _is_sparse_moe_block, + _is_sparse_sequaential_moe_block, register_sparse_moe_on_the_fly, ) # --------------------------------------------------------------------------- -# Helpers: lightweight mock modules for _is_sparse_moe_block +# Helpers: lightweight mock modules for _is_sparse_sequaential_moe_block # --------------------------------------------------------------------------- class _FakeGateWithRouter(nn.Module): """Mimics a v5.x TopKRouter gate with top_k and num_experts.""" @@ -101,25 +101,25 @@ def forward(self, hidden_states): # --------------------------------------------------------------------------- -# Tests for _is_sparse_moe_block +# Tests for _is_sparse_sequaential_moe_block # --------------------------------------------------------------------------- class TestIsSparseBlock: def test_no_experts_returns_false(self): module = nn.Linear(8, 8) - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_experts_but_no_gate_or_topk_returns_false(self): module = nn.Module() module.experts = nn.ModuleList([nn.Linear(8, 8)]) - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_gate_with_router_attrs_returns_true(self): block = _MoEBlockWithGateRouter(num_experts=4, top_k=2) - assert _is_sparse_moe_block(block) is True + assert _is_sparse_sequaential_moe_block(block) is True def test_fallback_block_level_attrs_returns_true(self): block = _MoEBlockFallback(num_experts=4, top_k=2) - assert _is_sparse_moe_block(block) is True + assert _is_sparse_sequaential_moe_block(block) is True def test_gate_missing_num_experts_returns_false(self): """gate.top_k present but gate.num_experts absent -> primary path fails.""" @@ -128,7 +128,7 @@ def test_gate_missing_num_experts_returns_false(self): gate = nn.Module() gate.top_k = 2 module.gate = gate - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_gate_missing_top_k_returns_false(self): """gate.num_experts present but gate.top_k absent -> primary path fails.""" @@ -137,14 +137,14 @@ def test_gate_missing_top_k_returns_false(self): gate = nn.Module() gate.num_experts = 4 module.gate = gate - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_block_level_top_k_infers_num_experts(self): """top_k on block + experts with __len__ -> num_experts is inferred, returns True.""" module = nn.Module() module.experts = nn.ModuleList([nn.Linear(8, 8)]) module.top_k = 2 - assert _is_sparse_moe_block(module) is True + assert _is_sparse_sequaential_moe_block(module) is True assert module.num_experts == 1 def test_block_level_top_k_no_len_returns_false(self): @@ -152,14 +152,14 @@ def test_block_level_top_k_no_len_returns_false(self): module = nn.Module() module.experts = nn.Module() module.top_k = 2 - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_block_level_only_num_experts_returns_false(self): """Only num_experts on block (no top_k) -> fallback fails.""" module = nn.Module() module.experts = nn.ModuleList([nn.Linear(8, 8)]) 
module.num_experts = 4 - assert _is_sparse_moe_block(module) is False + assert _is_sparse_sequaential_moe_block(module) is False def test_n_routed_experts_accepted(self): """A module with n_routed_experts (NemotronH-style) should be accepted.""" @@ -169,20 +169,21 @@ def test_n_routed_experts_accepted(self): gate.top_k = 2 gate.n_routed_experts = 4 module.gate = gate - assert _is_sparse_moe_block(module) is True + assert _is_sparse_sequaential_moe_block(module) is True # --------------------------------------------------------------------------- -# Tests for _QuantSparseMoe +# Tests for _QuantSparseSequentialMoe # --------------------------------------------------------------------------- -class TestQuantSparseMoe: - """Tests for _QuantSparseMoe using a real tiny Qwen3Moe model.""" +@pytest.mark.skipif(TRANSFORMERS_VERSION_GE_5_0, reason="Transformers v5 has stacked MoE") +class TestQuantSparseSequentialMoe: + """Tests for _QuantSparseSequentialMoe using a real tiny Qwen3Moe model.""" @staticmethod def _get_moe_block(model): """Return the first MoE block from the model.""" for module in model.modules(): - if _is_sparse_moe_block(module): + if _is_sparse_sequaential_moe_block(module): return module raise RuntimeError("No MoE block found in model") @@ -302,12 +303,13 @@ def test_token_counting_lazy_init(self): assert converted.expert_token_count.sum().item() == 8 * top_k -def test_qwen3_moe_quantize_with_token_forcing_and_counting(): +@pytest.mark.skipif(TRANSFORMERS_VERSION_GE_5_0, reason="Transformers v5 has stacked MoE") +def test_qwen3_sequential_moe_quantize_with_token_forcing_and_counting(): """End-to-end: mtq.quantize a Qwen3MoE with INT8 + moe_calib_experts_ratio + token counting.""" model = get_tiny_qwen3_moe() # Verify detection - moe_found = any(_is_sparse_moe_block(m) for m in model.modules()) + moe_found = any(_is_sparse_sequaential_moe_block(m) for m in model.modules()) assert moe_found, "Qwen3MoE should be detected as a sparse MoE block" quant_cfg = copy.deepcopy(mtq.INT8_DEFAULT_CFG)