From e7a403190a142a0cd7d215eee1187fef3cf8e59e Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 8 Apr 2026 13:50:06 +0200
Subject: [PATCH 1/2] fix

---
 src/transformers/conversion_mapping.py        | 20 ++++++++++++--------
 .../models/gemma3n/modeling_gemma3n.py        |  1 +
 .../models/gemma3n/modular_gemma3n.py         |  2 +-
 .../models/qwen3_5/modeling_qwen3_5.py        |  1 +
 .../models/qwen3_5/modular_qwen3_5.py         |  1 +
 .../qwen3_5_moe/modeling_qwen3_5_moe.py       |  1 +
 .../models/qwen3_5_moe/modular_qwen3_5_moe.py |  1 +
 7 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
index de599b0d42aa..e378302e3ebf 100755
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@@ -108,9 +108,6 @@ def _build_checkpoint_conversion_mapping():
             WeightRenaming(source_patterns=r"vlm.model", target_patterns="vlm"),
             WeightRenaming(source_patterns=r"vlm(?!\.(language_model|visual))", target_patterns="vlm.language_model"),
         ],
-        "gemma3n_text": [
-            WeightRenaming(source_patterns=r"^model.language_model", target_patterns="model"),
-        ],
         "timm_wrapper": [
             # Simply add the prefix `timm_model`. Similar to `base_model_prefix` but also removes prefix
             # when saving. TODO: Would be probably much cleaner with a `add_prefix` argument in WeightRenaming
@@ -152,9 +149,6 @@
             WeightRenaming("attention_layer_norm", "input_layernorm"),
             WeightRenaming("feedforward_layer_norm", "post_attention_layernorm"),
         ],
-        "qwen3_5_text": [
-            WeightRenaming(source_patterns=r"^model.language_model", target_patterns="model"),
-        ],
         "sam3_tracker": [
             WeightRenaming(
                 source_patterns=r"detector_model.vision_encoder.backbone.", target_patterns="vision_encoder.backbone."
@@ -518,8 +512,7 @@
         ),
     ]
 
-    mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
-    mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()
+    mapping["qwen3_5_moe_text"] = mapping["qwen2_moe"].copy()
 
     mapping["cohere_asr"] = [
         WeightRenaming(r"encoder\.pre_encode\.conv\.", r"encoder.subsampling.layers."),
@@ -612,6 +605,17 @@ def get_model_conversion_mapping(
     # Load models with explicit, user-provided key mapping
     if key_mapping is not None:
         weight_conversions = [WeightRenaming(source_patterns=k, target_patterns=v) for k, v in key_mapping.items()]
+    elif any(
+        allowed_name in class_name.__name__.lower()
+        for class_name in model.__class__.__mro__[:-1]
+        for allowed_name in ["qwen3_5", "gemma3n"]
+    ):
+        # TODO: these are used only for VLMs which sometimes are loaded as LLMs
+        # prob can be fixed as we did with `config_class`, all at once for VLM-LLMs
+        weight_conversions = [
+            WeightRenaming(source_patterns=k, target_patterns=v)
+            for k, v in model._checkpoint_conversion_mapping.items()
+        ]
     # Model have several `PreTrainedModel` within with the same model type
     # For ex: XForConditionalGeneration -> XModel. We don't want to apply the same
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index edca10b4f48e..002471ac3700 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -1772,6 +1772,7 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Gemma3nTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
 
     def __init__(self, config: Gemma3nTextConfig):
         super().__init__(config)
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py
index d5633a689687..c3a52fee6890 100644
--- a/src/transformers/models/gemma3n/modular_gemma3n.py
+++ b/src/transformers/models/gemma3n/modular_gemma3n.py
@@ -1926,7 +1926,7 @@ def forward(
 
 @auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
 class Gemma3nForCausalLM(Gemma3ForCausalLM):
-    pass
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
 
 
 class Gemma3nMultimodalEmbedder(nn.Module):
diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
index eba3eec02fdd..3efd55445d9a 100644
--- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1690,6 +1690,7 @@ class Qwen3_5ForCausalLM(Qwen3_5PreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Qwen3_5TextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py
index 8fddbc6115c1..26175d452004 100644
--- a/src/transformers/models/qwen3_5/modular_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py
@@ -652,6 +652,7 @@ def forward(
 
 class Qwen3_5ForCausalLM(Qwen3ForCausalLM):
     config: Qwen3_5TextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
index a8a46ecf508b..299aaa3a3343 100644
--- a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
+++ b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
@@ -1897,6 +1897,7 @@ class Qwen3_5MoeForCausalLM(Qwen3_5MoePreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Qwen3_5MoeTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
index f3b4b80aa3a6..54920c6b7f0b 100644
--- a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
+++ b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
@@ -240,6 +240,7 @@ class Qwen3_5MoeModel(Qwen3_5Model):
 
 class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM):
     config: Qwen3_5MoeTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):

From 421982e5e71b29aa5f29582c326396c6f1211da0 Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 8 Apr 2026 13:52:22 +0200
Subject: [PATCH 2/2] unskip tests

---
 tests/models/gemma3n/test_modeling_gemma3n.py         | 6 ------
 tests/models/qwen3_5/test_modeling_qwen3_5.py         | 6 ------
 tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py | 6 ------
 3 files changed, 18 deletions(-)

diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py
index e993b9e3ddf0..99e662a6b46b 100644
--- a/tests/models/gemma3n/test_modeling_gemma3n.py
+++ b/tests/models/gemma3n/test_modeling_gemma3n.py
@@ -866,12 +866,6 @@ def test_get_audio_features_attentions(self, return_dict: bool | None):
     def test_generate_with_quant_cache(self):
         pass
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _check_hidden_states_for_generate(
         self, batch_size, hidden_states, prompt_length, output_length, config, use_cache=False
     ):
diff --git a/tests/models/qwen3_5/test_modeling_qwen3_5.py b/tests/models/qwen3_5/test_modeling_qwen3_5.py
index 7725d2891a33..a230acfe7f21 100644
--- a/tests/models/qwen3_5/test_modeling_qwen3_5.py
+++ b/tests/models/qwen3_5/test_modeling_qwen3_5.py
@@ -304,12 +304,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _get_conv_state_shape(self, batch_size: int, config):
         num_v_heads = config.linear_num_value_heads
         num_k_heads = config.linear_num_key_heads
diff --git a/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py b/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
index e81e4d951917..c325a8682908 100644
--- a/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
+++ b/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
@@ -300,12 +300,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _get_conv_state_shape(self, batch_size: int, config):
         num_v_heads = config.linear_num_value_heads
         num_k_heads = config.linear_num_key_heads