From e7a403190a142a0cd7d215eee1187fef3cf8e59e Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 8 Apr 2026 13:50:06 +0200
Subject: [PATCH 1/2] fix

---
 src/transformers/conversion_mapping.py        | 20 ++++++++++++--------
 .../models/gemma3n/modeling_gemma3n.py        |  1 +
 .../models/gemma3n/modular_gemma3n.py         |  2 +-
 .../models/qwen3_5/modeling_qwen3_5.py        |  1 +
 .../models/qwen3_5/modular_qwen3_5.py         |  1 +
 .../qwen3_5_moe/modeling_qwen3_5_moe.py       |  1 +
 .../models/qwen3_5_moe/modular_qwen3_5_moe.py |  1 +
 7 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
index de599b0d42aa..e378302e3ebf 100755
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@@ -108,9 +108,6 @@ def _build_checkpoint_conversion_mapping():
             WeightRenaming(source_patterns=r"vlm.model", target_patterns="vlm"),
             WeightRenaming(source_patterns=r"vlm(?!\.(language_model|visual))", target_patterns="vlm.language_model"),
         ],
-        "gemma3n_text": [
-            WeightRenaming(source_patterns=r"^model.language_model", target_patterns="model"),
-        ],
         "timm_wrapper": [
             # Simply add the prefix `timm_model`. Similar to `base_model_prefix` but also removes prefix
             # when saving. TODO: Would be probably much cleaner with a `add_prefix` argument in WeightRenaming
@@ -152,9 +149,6 @@
             WeightRenaming("attention_layer_norm", "input_layernorm"),
             WeightRenaming("feedforward_layer_norm", "post_attention_layernorm"),
         ],
-        "qwen3_5_text": [
-            WeightRenaming(source_patterns=r"^model.language_model", target_patterns="model"),
-        ],
         "sam3_tracker": [
             WeightRenaming(
                 source_patterns=r"detector_model.vision_encoder.backbone.", target_patterns="vision_encoder.backbone."
@@ -518,8 +512,7 @@
         ),
     ]
 
-    mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
-    mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()
+    mapping["qwen3_5_moe_text"] = mapping["qwen2_moe"].copy()
 
     mapping["cohere_asr"] = [
         WeightRenaming(r"encoder\.pre_encode\.conv\.", r"encoder.subsampling.layers."),
@@ -612,6 +605,17 @@ def get_model_conversion_mapping(
     # Load models with explicit, user-provided key mapping
     if key_mapping is not None:
         weight_conversions = [WeightRenaming(source_patterns=k, target_patterns=v) for k, v in key_mapping.items()]
+    elif any(
+        allowed_name in class_name.__name__.lower()
+        for class_name in model.__class__.__mro__[:-1]
+        for allowed_name in ["qwen3_5", "gemma3n"]
+    ):
+        # TODO: these are used only for VLMs which sometimes are loaded as LLMs
+        # prob can be fixed as we did with `config_class`, all at once for VLM-LLMs
+        weight_conversions = [
+            WeightRenaming(source_patterns=k, target_patterns=v)
+            for k, v in model._checkpoint_conversion_mapping.items()
+        ]
     # Model have several `PreTrainedModel` within with the same model type
     # For ex: XForConditionalGeneration -> XModel. We don't want to apply the same
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index edca10b4f48e..002471ac3700 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -1772,6 +1772,7 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Gemma3nTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
 
     def __init__(self, config: Gemma3nTextConfig):
         super().__init__(config)
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py
index d5633a689687..c3a52fee6890 100644
--- a/src/transformers/models/gemma3n/modular_gemma3n.py
+++ b/src/transformers/models/gemma3n/modular_gemma3n.py
@@ -1926,7 +1926,7 @@ def forward(
 
 @auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
 class Gemma3nForCausalLM(Gemma3ForCausalLM):
-    pass
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
 
 
 class Gemma3nMultimodalEmbedder(nn.Module):
diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
index eba3eec02fdd..3efd55445d9a 100644
--- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1690,6 +1690,7 @@ class Qwen3_5ForCausalLM(Qwen3_5PreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Qwen3_5TextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py
index 8fddbc6115c1..26175d452004 100644
--- a/src/transformers/models/qwen3_5/modular_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py
@@ -652,6 +652,7 @@ def forward(
 
 class Qwen3_5ForCausalLM(Qwen3ForCausalLM):
     config: Qwen3_5TextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
index a8a46ecf508b..299aaa3a3343 100644
--- a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
+++ b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py
@@ -1897,6 +1897,7 @@ class Qwen3_5MoeForCausalLM(Qwen3_5MoePreTrainedModel, GenerationMixin):
     _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     config: Qwen3_5MoeTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):
diff --git a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
index f3b4b80aa3a6..54920c6b7f0b 100644
--- a/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
+++ b/src/transformers/models/qwen3_5_moe/modular_qwen3_5_moe.py
@@ -240,6 +240,7 @@ class Qwen3_5MoeModel(Qwen3_5Model):
 
 class Qwen3_5MoeForCausalLM(Qwen3NextForCausalLM):
     config: Qwen3_5MoeTextConfig
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
     _keys_to_ignore_on_load_unexpected = [r"^mtp.*", r"^model.visual.*"]
 
     def __init__(self, config):

From 421982e5e71b29aa5f29582c326396c6f1211da0 Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 8 Apr 2026 13:52:22 +0200
Subject: [PATCH 2/2] unskip tests

---
 tests/models/gemma3n/test_modeling_gemma3n.py         | 6 ------
 tests/models/qwen3_5/test_modeling_qwen3_5.py         | 6 ------
 tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py | 6 ------
 3 files changed, 18 deletions(-)

diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py
index e993b9e3ddf0..99e662a6b46b 100644
--- a/tests/models/gemma3n/test_modeling_gemma3n.py
+++ b/tests/models/gemma3n/test_modeling_gemma3n.py
@@ -866,12 +866,6 @@ def test_get_audio_features_attentions(self, return_dict: bool | None):
     def test_generate_with_quant_cache(self):
         pass
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _check_hidden_states_for_generate(
         self, batch_size, hidden_states, prompt_length, output_length, config, use_cache=False
     ):
diff --git a/tests/models/qwen3_5/test_modeling_qwen3_5.py b/tests/models/qwen3_5/test_modeling_qwen3_5.py
index 7725d2891a33..a230acfe7f21 100644
--- a/tests/models/qwen3_5/test_modeling_qwen3_5.py
+++ b/tests/models/qwen3_5/test_modeling_qwen3_5.py
@@ -304,12 +304,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _get_conv_state_shape(self, batch_size: int, config):
         num_v_heads = config.linear_num_value_heads
         num_k_heads = config.linear_num_key_heads
diff --git a/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py b/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
index e81e4d951917..c325a8682908 100644
--- a/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
+++ b/tests/models/qwen3_5_moe/test_modeling_qwen3_5_moe.py
@@ -300,12 +300,6 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    @unittest.skip(
-        "Conversion only for the `CausalLM` loading from saved `ConditionalLM`, doesn't apply to simple VLM"
-    )
-    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
-        pass
-
     def _get_conv_state_shape(self, batch_size: int, config):
         num_v_heads = config.linear_num_value_heads
         num_k_heads = config.linear_num_key_heads