diff --git a/docs/source/Instruction/Supported-models-and-datasets.md b/docs/source/Instruction/Supported-models-and-datasets.md
index eeeeb07cdf..38dc384a6b 100644
--- a/docs/source/Instruction/Supported-models-and-datasets.md
+++ b/docs/source/Instruction/Supported-models-and-datasets.md
@@ -408,7 +408,7 @@
 |[ZhipuAI/GLM-4.6-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.6-FP8](https://huggingface.co/zai-org/GLM-4.6-FP8)|
 |[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
 |[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
-|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✘|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
+|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
 |[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
 |[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|✘|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md
index dcdc490b38..2f6da8a041 100644
--- a/docs/source/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md
@@ -235,6 +235,7 @@
 - kv_lora_rank: Key 和 Value 张量低秩表示的秩(rank)值。默认为None,自动从config.json读取。
 - qk_head_dim: QK 投影中 head 的维度。 `q_head_dim = qk_head_dim + qk_pos_emb_head_dim`。默认为None,自动从config.json读取。
 - qk_pos_emb_head_dim: QK 投影中位置嵌入的维度。默认为None,自动从config.json读取。
+- v_head_dim: V 投影中的 head 维度。默认为None,自动从config.json读取。
 
 **MTP参数**
 - mtp_num_layers: 多token预测(MTP)层的数量。MTP将每个位置的预测范围扩展到多个未来token。此MTP实现使用D个顺序模块依次预测D个额外的token。默认为None。(需要"megatron-core>=0.14")
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
index cb11ad7b20..18868c1dc4 100644
--- a/docs/source_en/Instruction/Supported-models-and-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -409,7 +409,7 @@ The table below introduces the models integrated with ms-swift:
 |[ZhipuAI/GLM-4.6-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.6-FP8)|glm4_moe|glm4_5|transformers>=4.54|✘|-|[zai-org/GLM-4.6-FP8](https://huggingface.co/zai-org/GLM-4.6-FP8)|
 |[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
 |[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
-|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✘|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
+|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
 |[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
 |[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|✘|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
index 04c7443660..f7b64ededb 100644
--- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -248,6 +248,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train
 - kv_lora_rank: Low-rank representation rank value of the Key and Value tensors. Default is None and will be automatically read from config.json.
 - qk_head_dim: Dimension of the head in the QK projection. `q_head_dim = qk_head_dim + qk_pos_emb_head_dim`. Default is None and will be automatically read from config.json.
 - qk_pos_emb_head_dim: Dimension of the position embedding in the QK projection. Default is None and will be automatically read from config.json.
+- v_head_dim: Dimension of the head in the V projection. Default is None and will be automatically read from config.json.
 
 **MTP Parameters**
 
diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index 576e37fed6..ae85ae28d5 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -567,6 +567,7 @@ class MegatronArguments(ExtraMegatronArguments):
     kv_lora_rank: Optional[int] = None
     qk_head_dim: Optional[int] = None
     qk_pos_emb_head_dim: Optional[int] = None
+    v_head_dim: Optional[int] = None
 
     # mtp
     mtp_num_layers: Optional[int] = None
@@ -656,6 +657,8 @@ def _set_default(self):
             self.qk_head_dim = 128
         if self.qk_pos_emb_head_dim is None:
             self.qk_pos_emb_head_dim = 64
+        if self.v_head_dim is None:
+            self.v_head_dim = 128
         if self.task_type is None:
             self.task_type = 'causal_lm'
         if self.calculate_per_token_loss is None:
diff --git a/swift/megatron/model/gpt_bridge.py b/swift/megatron/model/gpt_bridge.py
index cbf9fc9f70..5c65169f5a 100644
--- a/swift/megatron/model/gpt_bridge.py
+++ b/swift/megatron/model/gpt_bridge.py
@@ -62,7 +62,11 @@ def __init__(self, disable_tqmd: bool = False):
         self.pp_group = mpu.get_pipeline_model_parallel_group()
         self.etp_group = mpu.get_expert_tensor_parallel_group()
         self.ep_group = mpu.get_expert_model_parallel_group()
-
+        self.is_transformers_5 = version.parse(transformers.__version__) >= version.parse('5.0.0.dev')
+        if self.is_transformers_5 and self.hf_model.model_info.is_moe_model and not self.args.merge_lora:
+            logger.warning('In transformers 5.0, the weight layout of MoE experts differs from that of Megatron. '
+                           'It is recommended to use `--merge_lora true`; otherwise the trained model may not be '
+                           'usable for inference with transformers.')
         self.tp_rank = mpu.get_tensor_model_parallel_rank()
         self.pp_rank = mpu.get_pipeline_model_parallel_rank()
         self.etp_rank = mpu.get_expert_tensor_parallel_rank()
@@ -713,8 +717,7 @@ def _set_mlp_state(self,
             num_local_experts = args.num_experts // self.ep_size
         # TODO: Temporary modification for transformers 5.0 compatibility with GLM4.6v, to be fixed later
         is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
-        if version.parse(
-                transformers.__version__) >= version.parse('5.0.0.dev') and self.args.hf_model_type == 'glm4v_moe':
+        if self.is_transformers_5 and self.args.hf_model_type in {'glm4v_moe', 'glm4_moe_lite'}:
             hf_grouped = False
             is_gate_up = False
         if to_mcore or hf_grouped:
diff --git a/swift/megatron/model/gpts/__init__.py b/swift/megatron/model/gpts/__init__.py
index fae8feb389..04d9266cf7 100644
--- a/swift/megatron/model/gpts/__init__.py
+++ b/swift/megatron/model/gpts/__init__.py
@@ -25,6 +25,7 @@
         ModelType.ernie4_5,
         ModelType.ernie4_5_moe,
         ModelType.glm4_moe,
+        ModelType.glm4_moe_lite,
         ModelType.gpt_oss,
     ],
 ))
diff --git a/swift/megatron/model/model_provider.py b/swift/megatron/model/model_provider.py
index 00a9ff1115..c4f664db04 100644
--- a/swift/megatron/model/model_provider.py
+++ b/swift/megatron/model/model_provider.py
@@ -143,9 +143,9 @@ def oom_observer(device, alloc, device_alloc, device_free):
             config, transformer_layer_spec_for_mtp, use_transformer_engine=use_te, **kwargs)
 
     if args.use_shared_expert_gate and args.num_experts and args.moe_shared_expert_intermediate_size:
-        # qwen2_moe
         for layer_spec in transformer_layer_spec.layer_specs:
-            layer_spec.submodules.mlp.submodules.shared_experts.params = {'gate': True}
+            if hasattr(layer_spec.submodules.mlp.submodules, 'shared_experts'):
+                layer_spec.submodules.mlp.submodules.shared_experts.params = {'gate': True}
 
     model = megatron_model_meta.model_cls(
         config=config,
         transformer_layer_spec=transformer_layer_spec,
diff --git a/swift/megatron/utils/config.py b/swift/megatron/utils/config.py
index 59498a68ba..24720ad3a4 100644
--- a/swift/megatron/utils/config.py
+++ b/swift/megatron/utils/config.py
@@ -20,7 +20,7 @@
     'swiglu': ['hidden_act'],
     'add_qkv_bias': ['attention_bias', 'qkv_bias', 'use_bias'],
     'disable_bias_linear': ['mlp_bias'],
-    'kv_channels': ['head_dim', 'v_head_dim'],
+    'kv_channels': ['head_dim'],
     'hf_model_type': ['model_type'],
     # moe
     'moe_ffn_hidden_size': ['moe_intermediate_size'],
@@ -37,6 +37,7 @@
     'moe_router_bias_update_rate': ['aux_loss_alpha'],
     'qk_head_dim': ['qk_nope_head_dim'],
     'qk_pos_emb_head_dim': ['qk_rope_head_dim'],
+    'v_head_dim': ['v_head_dim'],
     'moe_router_topk_scaling_factor': ['routed_scaling_factor'],
     'qk_layernorm': ['use_qk_norm'],
     # qwen3_next
@@ -104,6 +105,7 @@ def convert_hf_config(config) -> Dict[str, Any]:
     mlp_ffn_hidden_size = res.pop('mlp_ffn_hidden_size', None)
     interleave_moe_layer_step = res.pop('interleave_moe_layer_step', None)
     window_size = res.pop('window_size', None)
+    rope_scaling = res.get('rope_scaling') or {}
     if llm_model_type in {'qwen3', 'qwen3_moe', 'qwen3_next'
                           } or hf_model_type in {'qwen3_omni_moe', 'qwen3_omni', 'qwen3_vl', 'qwen3_vl_moe'}:
         res['qk_layernorm'] = True
@@ -149,8 +151,11 @@ def convert_hf_config(config) -> Dict[str, Any]:
         else:
             window_attn_skip_freq = ','.join(['1' if lt == 'sliding_attention' else '0' for lt in layer_types])
             res['window_attn_skip_freq'] = f'[{window_attn_skip_freq}]'
-    elif llm_model_type == 'glm4_moe' or hf_model_type == 'glm4v_moe':
+    elif llm_model_type in {'glm4_moe', 'glm4_moe_lite'} or hf_model_type == 'glm4v_moe':
         res['moe_router_score_function'] = 'sigmoid'
+        if llm_model_type == 'glm4_moe_lite':
+            res['qk_layernorm'] = True
+            res.pop('num_query_groups', None)
     elif llm_model_type == 'qwen3_next':
         full_attention_interval = res.pop('full_attention_interval')
         num_layers = res['num_layers']
@@ -180,9 +185,10 @@ def convert_hf_config(config) -> Dict[str, Any]:
         res['moe_layer_freq'] = f"[{','.join(moe_layer_freq)}]"
     elif hf_model_type == 'glm4v':
         res['rotary_interleaved'] = True
-    rope_scaling = res.get('rope_scaling') or {}
     if 'partial_rotary_factor' not in res and 'partial_rotary_factor' in rope_scaling:
         res['partial_rotary_factor'] = rope_scaling['partial_rotary_factor']
+    if 'rotary_base' not in res and 'rope_theta' in rope_scaling:
+        res['rotary_base'] = rope_scaling['rope_theta']
     if rope_scaling.get('mrope_section') is not None:
         res['position_embedding_type'] = 'mrope'
         res['mrope_section'] = rope_scaling['mrope_section']
diff --git a/swift/megatron/utils/convert_utils.py b/swift/megatron/utils/convert_utils.py
index b576e693ce..cb0dabf5e4 100644
--- a/swift/megatron/utils/convert_utils.py
+++ b/swift/megatron/utils/convert_utils.py
@@ -192,10 +192,11 @@ def test_convert_precision(hf_model, mg_model, template, torch_dtype=torch.float
     _param = next(mg_language_model.parameters())
     mg_dtype = _param.dtype
     mg_device = _param.device
-    # router to bfloat16
-    for n, m in mg_language_model.named_modules():
-        if n.endswith('router'):
-            m.to(mg_dtype)
+    if args.hf_model_type == 'minimax_m2':
+        # router to bfloat16
+        for n, m in mg_language_model.named_modules():
+            if n.endswith('router'):
+                m.to(mg_dtype)
     with torch.inference_mode(), _model_cpu_forward_context(
             mg_modules, torch_dtype, 'cuda', share_embedding=share_embedding, target_device=mg_device):
         mg_logits = forward_step_helper(mg_model, mg_inputs, dtype=torch_dtype)
diff --git a/tests/megatron/test_align/test_llm.py b/tests/megatron/test_align/test_llm.py
index 84f547edea..1c49ff84db 100644
--- a/tests/megatron/test_align/test_llm.py
+++ b/tests/megatron/test_align/test_llm.py
@@ -148,6 +148,10 @@ def test_minimax_m2():
     _test_model('MiniMax/MiniMax-M2.1')
 
 
+def test_glm4_moe_lite():
+    _test_model('ZhipuAI/GLM-4.7-Flash')
+
+
 if __name__ == '__main__':
     # test_qwen2()
     # test_llama2()
@@ -178,4 +182,5 @@ def test_minimax_m2():
     # test_ernie_thinking()
     # test_tongyi_deepresearch()
     # test_glm4()
-    test_minimax_m2()
+    # test_minimax_m2()
+    test_glm4_moe_lite()
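
The config-mapping change is the core of the patch, so a short illustration may help. The sketch below is a minimal, hypothetical stand-in for the lookup performed by `convert_hf_config` in `swift/megatron/utils/config.py`: the mapping entries mirror the post-patch hunks (with `v_head_dim` mapped on its own instead of aliasing `kv_channels`), while the `convert` helper and the sample values are invented for illustration and are not taken from a real GLM-4.7-Flash config.json.

    # Hypothetical reduction of the key-mapping table after this patch.
    CONFIG_MAPPING = {
        'kv_channels': ['head_dim'],                  # 'v_head_dim' is no longer an alias here
        'qk_head_dim': ['qk_nope_head_dim'],
        'qk_pos_emb_head_dim': ['qk_rope_head_dim'],
        'v_head_dim': ['v_head_dim'],                 # new standalone MLA mapping
    }

    def convert(hf_config: dict) -> dict:
        # For each Megatron argument, take the first listed HF config key present.
        res = {}
        for mg_key, hf_keys in CONFIG_MAPPING.items():
            for hf_key in hf_keys:
                if hf_key in hf_config:
                    res[mg_key] = hf_config[hf_key]
                    break
        return res

    # Illustrative MLA-style values; q_head_dim = qk_head_dim + qk_pos_emb_head_dim = 192.
    print(convert({'qk_nope_head_dim': 128, 'qk_rope_head_dim': 64, 'v_head_dim': 128}))
    # {'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128}

If none of these keys is present in config.json, `MegatronArguments._set_default` now falls back to `v_head_dim = 128`, matching the existing `qk_head_dim` default shown in the `megatron_args.py` hunk.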