2 changes: 1 addition & 1 deletion docs/source/Instruction/Supported-models-and-datasets.md
@@ -408,7 +408,7 @@
|[ZhipuAI/GLM-4.6-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.6-FP8](https://huggingface.co/zai-org/GLM-4.6-FP8)|
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✘|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|&#x2718;|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
1 change: 1 addition & 0 deletions docs/source/Megatron-SWIFT/Command-line-parameters.md
@@ -235,6 +235,7 @@
- kv_lora_rank: Rank of the low-rank representation of the Key and Value tensors. Default is None and will be automatically read from config.json.
- qk_head_dim: Dimension of the head in the QK projection. `q_head_dim = qk_head_dim + qk_pos_emb_head_dim`. Default is None and will be automatically read from config.json.
- qk_pos_emb_head_dim: Dimension of the position embedding in the QK projection. Default is None and will be automatically read from config.json.
- v_head_dim: Dimension of the head in the V projection. Default is None and will be automatically read from config.json.

**MTP Parameters**
- mtp_num_layers: Number of Multi-Token Prediction (MTP) layers. MTP extends the prediction scope at each position to multiple future tokens. This MTP implementation uses D sequential modules to predict D additional tokens in turn, as sketched below. Default is None. (Requires "megatron-core>=0.14")
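To make the "D sequential modules" idea concrete, the following is a minimal, illustrative sketch; the class name `NaiveMTPHead` and the plain linear blocks are assumptions for illustration, not the Megatron-Core implementation:

```python
import torch
import torch.nn as nn

class NaiveMTPHead(nn.Module):
    """Illustrative only: D sequential modules; module k predicts token t+k+1."""

    def __init__(self, hidden_size: int, vocab_size: int, mtp_num_layers: int):
        super().__init__()
        # One block per extra predicted token (depth D = mtp_num_layers).
        self.blocks = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(mtp_num_layers))
        self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, hidden_states: torch.Tensor) -> list:
        # hidden_states: [batch, seq, hidden] from the backbone model.
        logits_per_depth = []
        for block in self.blocks:
            # Each module refines the state before predicting one token further ahead.
            hidden_states = torch.tanh(block(hidden_states))
            logits_per_depth.append(self.lm_head(hidden_states))
        return logits_per_depth  # D tensors of shape [batch, seq, vocab_size]
```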
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -409,7 +409,7 @@ The table below introduces the models integrated with ms-swift:
|[ZhipuAI/GLM-4.6-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6-FP8)|glm4_moe|glm4_5|transformers>=4.54|&#x2718;|-|[zai-org/GLM-4.6-FP8](https://huggingface.co/zai-org/GLM-4.6-FP8)|
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|&#x2714;|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|&#x2718;|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|&#x2718;|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|&#x2714;|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|&#x2718;|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
1 change: 1 addition & 0 deletions docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -248,6 +248,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train
- kv_lora_rank: Low-rank representation rank value of the Key and Value tensors. Default is None and will be automatically read from config.json.
- qk_head_dim: Dimension of the head in the QK projection. `q_head_dim = qk_head_dim + qk_pos_emb_head_dim`. Default is None and will be automatically read from config.json.
- qk_pos_emb_head_dim: Dimension of the position embedding in the QK projection. Default is None and will be automatically read from config.json.
- v_head_dim: Dimension of the head in the V projection. Default is None and will be automatically read from config.json; see the sketch below for how these MLA dimensions resolve together.
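As a rough illustration of how these four values fit together, here is a minimal sketch. It assumes the config.json key names used by this PR's mapping in `swift/megatron/utils/config.py` (`qk_nope_head_dim`, `qk_rope_head_dim`, `v_head_dim`) and the fallback defaults from `MegatronArguments._set_default`; the helper `resolve_mla_dims` itself is hypothetical, not part of ms-swift:

```python
def resolve_mla_dims(hf_config: dict) -> dict:
    # Fall back to the _set_default values when config.json omits a key.
    qk_head_dim = hf_config.get('qk_nope_head_dim') or 128
    qk_pos_emb_head_dim = hf_config.get('qk_rope_head_dim') or 64
    v_head_dim = hf_config.get('v_head_dim') or 128
    return {
        'qk_head_dim': qk_head_dim,
        'qk_pos_emb_head_dim': qk_pos_emb_head_dim,
        'v_head_dim': v_head_dim,
        # As documented above: q_head_dim = qk_head_dim + qk_pos_emb_head_dim.
        'q_head_dim': qk_head_dim + qk_pos_emb_head_dim,
    }
```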


**MTP Parameters**
3 changes: 3 additions & 0 deletions swift/megatron/arguments/megatron_args.py
@@ -567,6 +567,7 @@ class MegatronArguments(ExtraMegatronArguments):
kv_lora_rank: Optional[int] = None
qk_head_dim: Optional[int] = None
qk_pos_emb_head_dim: Optional[int] = None
v_head_dim: Optional[int] = None

# mtp
mtp_num_layers: Optional[int] = None
@@ -656,6 +657,8 @@ def _set_default(self):
self.qk_head_dim = 128
if self.qk_pos_emb_head_dim is None:
self.qk_pos_emb_head_dim = 64
if self.v_head_dim is None:
self.v_head_dim = 128
if self.task_type is None:
self.task_type = 'causal_lm'
if self.calculate_per_token_loss is None:
9 changes: 6 additions & 3 deletions swift/megatron/model/gpt_bridge.py
@@ -62,7 +62,11 @@ def __init__(self, disable_tqmd: bool = False):
self.pp_group = mpu.get_pipeline_model_parallel_group()
self.etp_group = mpu.get_expert_tensor_parallel_group()
self.ep_group = mpu.get_expert_model_parallel_group()

self.is_transformers_5 = version.parse(transformers.__version__) >= version.parse('5.0.0.dev')
if self.is_transformers_5 and self.hf_model.model_info.is_moe_model and not self.args.merge_lora:
logger.warning('In transformers 5.0, the weight organization of MoE model experts differs from Megatron. '
'It is recommended to use `--merge_lora true`, otherwise the trained model may not be '
'usable for inference with transformers.')
self.tp_rank = mpu.get_tensor_model_parallel_rank()
self.pp_rank = mpu.get_pipeline_model_parallel_rank()
self.etp_rank = mpu.get_expert_tensor_parallel_rank()
@@ -713,8 +717,7 @@ def _set_mlp_state(self,
num_local_experts = args.num_experts // self.ep_size
# TODO: Temporary modification for transformers 5.0 compatibility with GLM4.6v, to be fixed later
is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
if version.parse(
transformers.__version__) >= version.parse('5.0.0.dev') and self.args.hf_model_type == 'glm4v_moe':
if self.is_transformers_5 and self.args.hf_model_type in {'glm4v_moe', 'glm4_moe_lite'}:
hf_grouped = False
is_gate_up = False
Comment on lines +720 to 722
Contributor

medium

The TODO comment acknowledges this is a temporary modification. However, adding model-specific logic for glm4v_moe and glm4_moe_lite inside the generic _set_mlp_state function makes the code harder to maintain. As more models with special requirements are added, this function could become cluttered with if/elif statements.

A better approach would be to abstract this model-specific logic. You could introduce a method in the GPTBridge that can be overridden by model-specific bridge subclasses, or use a dispatch mechanism based on hf_model_type. This would improve code organization and make it easier to add or modify support for different models in the future.
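As a rough sketch of the dispatch idea (the table `_MLP_LAYOUT_OVERRIDES` and helper `resolve_mlp_layout` are hypothetical, not part of this PR):

```python
# Hypothetical: per-model expert-weight layout overrides live in one table
# instead of accumulating if/elif branches inside _set_mlp_state.
_MLP_LAYOUT_OVERRIDES = {
    # Under transformers>=5.0 these models store expert weights ungrouped
    # and without a fused gate_up projection.
    'glm4v_moe': {'hf_grouped': False, 'is_gate_up': False},
    'glm4_moe_lite': {'hf_grouped': False, 'is_gate_up': False},
}

def resolve_mlp_layout(hf_model_type: str, is_transformers_5: bool,
                       hf_grouped: bool, is_gate_up: bool):
    """Return (hf_grouped, is_gate_up) with any model-specific override applied."""
    if is_transformers_5 and hf_model_type in _MLP_LAYOUT_OVERRIDES:
        override = _MLP_LAYOUT_OVERRIDES[hf_model_type]
        return override['hf_grouped'], override['is_gate_up']
    return hf_grouped, is_gate_up
```

Adding support for another model would then be a one-line table entry rather than another branch in the conversion logic.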

if to_mcore or hf_grouped:
1 change: 1 addition & 0 deletions swift/megatron/model/gpts/__init__.py
@@ -25,6 +25,7 @@
ModelType.ernie4_5,
ModelType.ernie4_5_moe,
ModelType.glm4_moe,
ModelType.glm4_moe_lite,
ModelType.gpt_oss,
],
))
4 changes: 2 additions & 2 deletions swift/megatron/model/model_provider.py
@@ -143,9 +143,9 @@ def oom_observer(device, alloc, device_alloc, device_free):
config, transformer_layer_spec_for_mtp, use_transformer_engine=use_te, **kwargs)

if args.use_shared_expert_gate and args.num_experts and args.moe_shared_expert_intermediate_size:
# qwen2_moe
for layer_spec in transformer_layer_spec.layer_specs:
layer_spec.submodules.mlp.submodules.shared_experts.params = {'gate': True}
if hasattr(layer_spec.submodules.mlp.submodules, 'shared_experts'):
layer_spec.submodules.mlp.submodules.shared_experts.params = {'gate': True}
model = megatron_model_meta.model_cls(
config=config,
transformer_layer_spec=transformer_layer_spec,
12 changes: 9 additions & 3 deletions swift/megatron/utils/config.py
@@ -20,7 +20,7 @@
'swiglu': ['hidden_act'],
'add_qkv_bias': ['attention_bias', 'qkv_bias', 'use_bias'],
'disable_bias_linear': ['mlp_bias'],
'kv_channels': ['head_dim', 'v_head_dim'],
'kv_channels': ['head_dim'],
'hf_model_type': ['model_type'],
# moe
'moe_ffn_hidden_size': ['moe_intermediate_size'],
@@ -37,6 +37,7 @@
'moe_router_bias_update_rate': ['aux_loss_alpha'],
'qk_head_dim': ['qk_nope_head_dim'],
'qk_pos_emb_head_dim': ['qk_rope_head_dim'],
'v_head_dim': ['v_head_dim'],
'moe_router_topk_scaling_factor': ['routed_scaling_factor'],
'qk_layernorm': ['use_qk_norm'],
# qwen3_next
@@ -104,6 +105,7 @@ def convert_hf_config(config) -> Dict[str, Any]:
mlp_ffn_hidden_size = res.pop('mlp_ffn_hidden_size', None)
interleave_moe_layer_step = res.pop('interleave_moe_layer_step', None)
window_size = res.pop('window_size', None)
rope_scaling = res.get('rope_scaling') or {}
if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'
} or hf_model_type in {'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe'}:
res['qk_layernorm'] = True
@@ -149,8 +151,11 @@ def convert_hf_config(config) -> Dict[str, Any]:
else:
window_attn_skip_freq = ','.join(['1' if lt == 'sliding_attention' else '0' for lt in layer_types])
res['window_attn_skip_freq'] = f'[{window_attn_skip_freq}]'
elif llm_model_type == 'glm4_moe' or hf_model_type == 'glm4v_moe':
elif llm_model_type in {'glm4_moe', 'glm4_moe_lite'} or hf_model_type == 'glm4v_moe':
res['moe_router_score_function'] = 'sigmoid'
if llm_model_type == 'glm4_moe_lite':
res['qk_layernorm'] = True
res.pop('num_query_groups', None)
elif llm_model_type == 'qwen3_next':
full_attention_interval = res.pop('full_attention_interval')
num_layers = res['num_layers']
@@ -180,9 +185,10 @@ def convert_hf_config(config) -> Dict[str, Any]:
res['moe_layer_freq'] = f"[{','.join(moe_layer_freq)}]"
elif hf_model_type == 'glm4v':
res['rotary_interleaved'] = True
rope_scaling = res.get('rope_scaling') or {}
if 'partial_rotary_factor' not in res and 'partial_rotary_factor' in rope_scaling:
res['partial_rotary_factor'] = rope_scaling['partial_rotary_factor']
if 'rotary_base' not in res and 'rope_theta' in rope_scaling:
res['rotary_base'] = rope_scaling['rope_theta']
if rope_scaling.get('mrope_section') is not None:
res['position_embedding_type'] = 'mrope'
res['mrope_section'] = rope_scaling['mrope_section']
9 changes: 5 additions & 4 deletions swift/megatron/utils/convert_utils.py
@@ -192,10 +192,11 @@ def test_convert_precision(hf_model, mg_model, template, torch_dtype=torch.float
_param = next(mg_language_model.parameters())
mg_dtype = _param.dtype
mg_device = _param.device
# router to bfloat16
for n, m in mg_language_model.named_modules():
if n.endswith('router'):
m.to(mg_dtype)
if args.hf_model_type == 'minimax_m2':
# router to bfloat16
for n, m in mg_language_model.named_modules():
if n.endswith('router'):
m.to(mg_dtype)
Comment on lines +195 to +199
Contributor

medium

This model-specific logic for minimax_m2 is placed within a generic testing utility. This can harm maintainability as more model-specific workarounds are added. Consider refactoring this logic into a model-specific setup function or a hook that can be registered for minimax_m2. This would keep the testing utility generic and make model-specific adjustments more explicit and easier to manage.

For example, you could add a post_load_hook to the model's bridge or meta class.
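A minimal sketch of what such a hook could look like (the registry and `register_post_load_hook` are hypothetical, not existing ms-swift API):

```python
# Hypothetical registry: model-specific fixups register once, and the
# precision-test utility stays generic.
_POST_LOAD_HOOKS = {}

def register_post_load_hook(hf_model_type: str):
    def decorator(fn):
        _POST_LOAD_HOOKS[hf_model_type] = fn
        return fn
    return decorator

@register_post_load_hook('minimax_m2')
def _cast_minimax_routers(mg_language_model, mg_dtype):
    # Keep router weights in the model's compute dtype (e.g. bfloat16).
    for name, module in mg_language_model.named_modules():
        if name.endswith('router'):
            module.to(mg_dtype)

# The generic call site inside test_convert_precision would then be:
#   hook = _POST_LOAD_HOOKS.get(args.hf_model_type)
#   if hook is not None:
#       hook(mg_language_model, mg_dtype)
```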

with torch.inference_mode(), _model_cpu_forward_context(
mg_modules, torch_dtype, 'cuda', share_embedding=share_embedding, target_device=mg_device):
mg_logits = forward_step_helper(mg_model, mg_inputs, dtype=torch_dtype)
7 changes: 6 additions & 1 deletion tests/megatron/test_align/test_llm.py
@@ -148,6 +148,10 @@ def test_minimax_m2():
_test_model('MiniMax/MiniMax-M2.1')


def test_glm4_moe_lite():
_test_model('ZhipuAI/GLM-4.7-Flash')


if __name__ == '__main__':
# test_qwen2()
# test_llama2()
@@ -178,4 +182,5 @@ def test_minimax_m2():
# test_ernie_thinking()
# test_tongyi_deepresearch()
# test_glm4()
test_minimax_m2()
# test_minimax_m2()
test_glm4_moe_lite()