From dc6225bb4df755341ba3aa95c21844f979c1d61d Mon Sep 17 00:00:00 2001 From: khazic Date: Mon, 27 Apr 2026 16:48:32 +0800 Subject: [PATCH 01/27] feat(llm): add Hy3-preview (HYV3) SFT support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds SFT training support for tencent/Hy3-preview (295B MoE, 192 experts top-8, 256K context). Requires transformers >= 5.6.0. New files: - nemo_automodel/components/models/hy_v3/layers.py: HYV3Attention with GQA, per-head QK RMSNorm, and RoPE - nemo_automodel/components/models/hy_v3/model.py: HYV3ForCausalLM / HYV3Model / Block wrapping Automodel's MoE infrastructure - nemo_automodel/components/models/hy_v3/state_dict_adapter.py: HYV3StateDictAdapter handling HF↔native conversion (expert tensor transposition, e_score_correction_bias relocation, MTP layer skipping) - examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml: example SFT config Key architecture differences vs Qwen3-MoE: - Sigmoid routing with e_score_correction_bias (vs softmax) - first_k_dense_replace=1: only layer 0 is dense - 1 shared expert alongside 192 routed experts - route_scale=2.826 applied to routing weights - HF expert tensors are pre-grouped [n,2i,h] vs per-expert; only need transposition (not stack/concat) in state_dict_adapter Registers HYV3ForCausalLM in MODEL_ARCH_MAPPING. Signed-off-by: khazic --- .../hy_v3/hy3_preview_deepep.yaml | 117 +++++++ nemo_automodel/_transformers/registry.py | 4 + .../components/models/hy_v3/__init__.py | 17 + .../components/models/hy_v3/layers.py | 138 ++++++++ .../components/models/hy_v3/model.py | 327 ++++++++++++++++++ .../models/hy_v3/state_dict_adapter.py | 205 +++++++++++ 6 files changed, 808 insertions(+) create mode 100644 examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml create mode 100644 nemo_automodel/components/models/hy_v3/__init__.py create mode 100644 nemo_automodel/components/models/hy_v3/layers.py create mode 100644 nemo_automodel/components/models/hy_v3/model.py create mode 100644 nemo_automodel/components/models/hy_v3/state_dict_adapter.py diff --git a/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml new file mode 100644 index 000000000..523a72b0d --- /dev/null +++ b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml @@ -0,0 +1,117 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SFT recipe for tencent/Hy3-preview (295B MoE, 192 experts top-8, 256K context). +# Requires transformers >= 5.6.0 for the hy_v3 architecture. +# +# Hardware: 8 GPUs (80 GB+ each) for LoRA; 32 GPUs for full fine-tuning. +# automodel examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml --nproc-per-node 8 +# +# EP size must divide num_experts (192): e.g. ep_size=8 gives 24 experts/rank. + +recipe: TrainFinetuneRecipeForNextTokenPrediction + +step_scheduler: + global_batch_size: 256 + local_batch_size: 4 + ckpt_every_steps: 500 + val_every_steps: 100 + num_epochs: 1 + +dist_env: + backend: nccl + timeout_minutes: 30 + +rng: + _target_: nemo_automodel.components.training.rng.StatefulRNG + seed: 1111 + ranked: true + +model: + _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained + pretrained_model_name_or_path: tencent/Hy3-preview + torch_dtype: bfloat16 + backend: + _target_: nemo_automodel.components.models.common.BackendConfig + attn: te + linear: torch + rms_norm: torch_fp32 + experts: torch_mm + dispatcher: deepep + fake_balanced_gate: false + enable_hf_state_dict_adapter: true + enable_fsdp_optimizations: true + +checkpoint: + enabled: true + checkpoint_dir: /tmp/checkpoints/hy3_preview/ + model_save_format: safetensors + save_consolidated: true + +distributed: + strategy: fsdp2 + tp_size: 1 + cp_size: 1 + pp_size: 1 + ep_size: 8 # 192 experts / 8 ranks = 24 experts per rank + + sequence_parallel: false + activation_checkpointing: true + + pipeline: + pp_schedule: interleaved1f1b + pp_microbatch_size: 4 + round_virtual_stages_to_pp_multiple: up + scale_grads_in_schedule: false + patch_inner_model: false + patch_causal_lm_model: false + +loss_fn: + _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy + +dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: train + +packed_sequence: + packed_sequence_size: 0 + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + shuffle: true + +validation_dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: validation + num_samples_limit: 64 + +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + batch_size: 4 + +optimizer: + _target_: torch.optim.Adam + betas: [0.9, 0.95] + eps: 1e-7 + lr: 1.0e-5 + weight_decay: 0.1 + +# Uncomment for W&B logging +# wandb: +# project: hy3-preview-sft +# name: hy3_preview_deepep diff --git a/nemo_automodel/_transformers/registry.py b/nemo_automodel/_transformers/registry.py index 593db1b07..bd025f354 100644 --- a/nemo_automodel/_transformers/registry.py +++ b/nemo_automodel/_transformers/registry.py @@ -133,6 +133,10 @@ "LLaVAOneVision1_5_ForConditionalGeneration", ), ), + ( + "HYV3ForCausalLM", + ("nemo_automodel.components.models.hy_v3.model", "HYV3ForCausalLM"), + ), ( "Qwen2ForCausalLM", ("nemo_automodel.components.models.qwen2.model", "Qwen2ForCausalLM"), diff --git a/nemo_automodel/components/models/hy_v3/__init__.py b/nemo_automodel/components/models/hy_v3/__init__.py new file mode 100644 index 000000000..0aba0c161 --- /dev/null +++ b/nemo_automodel/components/models/hy_v3/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM + +__all__ = ["HYV3ForCausalLM"] diff --git a/nemo_automodel/components/models/hy_v3/layers.py b/nemo_automodel/components/models/hy_v3/layers.py new file mode 100644 index 000000000..c0865dbf0 --- /dev/null +++ b/nemo_automodel/components/models/hy_v3/layers.py @@ -0,0 +1,138 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import torch +from torch import nn + +from nemo_automodel.components.attention.utils import ( + initialize_attn_module_and_func, + postprocess_output_for_attn, + preprocess_args_and_kwargs_for_attn, +) +from nemo_automodel.components.models.common import ( + BackendConfig, + initialize_linear_module, + initialize_rms_norm_module, +) +from nemo_automodel.components.models.gpt_oss.rope_utils import apply_rotary_emb_qk + + +class HYV3Attention(nn.Module): + """HYV3 attention with GQA, per-head Q/K RMSNorm, and RoPE. + + Architecture: + - q_proj: [hidden, n_heads * head_dim] + - k_proj / v_proj: [hidden, n_kv_heads * head_dim] + - q_norm / k_norm: RMSNorm applied per-head before RoPE + - RoPE applied after per-head norm + """ + + def __init__(self, config: Any, backend: BackendConfig): + super().__init__() + self.backend = backend + + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads) + + attention_bias = getattr(config, "attention_bias", False) + + self.q_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_heads * self.head_dim, attention_bias + ) + self.k_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias + ) + self.v_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias + ) + self.o_proj = initialize_linear_module( + backend.linear, self.num_heads * self.head_dim, config.hidden_size, attention_bias + ) + + # Per-head RMSNorm on Q and K (qk_norm=true in HYV3 config) + self.q_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps) + self.k_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps) + + softmax_scale = self.head_dim**-0.5 + self.attn_module, self.attn_func = initialize_attn_module_and_func( + attn_impl=backend.attn, + num_attention_heads=self.num_heads, + num_qk_channels=self.head_dim, + num_v_channels=self.head_dim, + softmax_scale=softmax_scale, + num_gqa_groups=self.num_kv_heads, + ) + + def forward( + self, + x: torch.Tensor, + *, + freqs_cis: torch.Tensor, + attention_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if len(x.shape) == 2: + qkv_format = "thd" + num_tokens = x.shape[0] + else: + qkv_format = "bshd" + bsz, seqlen, _ = x.size() + + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + if qkv_format == "thd": + q = q.view(num_tokens, self.num_heads, self.head_dim) + k = k.view(num_tokens, self.num_kv_heads, self.head_dim) + v = v.view(num_tokens, self.num_kv_heads, self.head_dim) + else: + q = q.view(bsz, seqlen, self.num_heads, self.head_dim) + k = k.view(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = v.view(bsz, seqlen, self.num_kv_heads, self.head_dim) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k = apply_rotary_emb_qk( + q, + k, + freqs_cis, + format=qkv_format, + rope_fusion=self.backend.rope_fusion, + cu_seqlens=attn_kwargs.get("cu_seqlens", None), + cp_size=attn_kwargs.get("cp_size", 1), + cp_rank=attn_kwargs.get("cp_rank", 0), + ) + + q, k, v, _attn_kwargs = preprocess_args_and_kwargs_for_attn( + q, k, v, attention_mask, self.backend.attn, **attn_kwargs + ) + out = self.attn_func(q, k, v, **_attn_kwargs) + out = postprocess_output_for_attn(out, self.backend.attn) + + flatten_dim = 2 if qkv_format == "bshd" else 1 + out = self.o_proj(out.flatten(flatten_dim)) + return out + + def init_weights(self, buffer_device: torch.device, init_std: float = 0.02): + for linear in (self.q_proj, self.k_proj, self.v_proj, self.o_proj): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + if hasattr(linear, "bias") and linear.bias is not None: + nn.init.zeros_(linear.bias) + for norm in (self.q_norm, self.k_norm): + norm.reset_parameters() diff --git a/nemo_automodel/components/models/hy_v3/model.py b/nemo_automodel/components/models/hy_v3/model.py new file mode 100644 index 000000000..a8217ad5c --- /dev/null +++ b/nemo_automodel/components/models/hy_v3/model.py @@ -0,0 +1,327 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""HYV3ForCausalLM — Tencent Hy3-preview (295B MoE) SFT support. + +Architecture (from tencent/Hy3-preview config.json): + - 80 transformer layers; layer 0 is dense, layers 1-79 are MoE + - MoE: 192 routed experts + 1 shared expert, top-8 activated + - Sigmoid routing with expert-bias correction (e_score_correction_bias) + - GQA: 64 Q heads, 8 KV heads, head_dim=128 + - Per-head QK RMSNorm before RoPE + - 256K context, rope_theta=11158840 +""" + +from typing import Any + +import torch +import torch.nn as nn + +from nemo_automodel.components.models.common import ( + BackendConfig, + get_rope_config, + initialize_linear_module, + initialize_rms_norm_module, +) +from nemo_automodel.components.models.common.hf_checkpointing_mixin import HFCheckpointingMixin +from nemo_automodel.components.models.common.utils import cast_model_to_dtype +from nemo_automodel.components.models.gpt_oss.rope_utils import RotaryEmbedding, position_ids_to_freqs_cis +from nemo_automodel.components.models.hy_v3.layers import HYV3Attention +from nemo_automodel.components.models.hy_v3.state_dict_adapter import HYV3StateDictAdapter +from nemo_automodel.components.moe.config import MoEConfig +from nemo_automodel.components.moe.fsdp_mixin import MoEFSDPSyncMixin +from nemo_automodel.components.moe.layers import MLP, MoE +from nemo_automodel.components.utils.model_utils import squeeze_input_for_thd +from nemo_automodel.shared.utils import dtype_from_str as get_dtype + + +class Block(nn.Module): + def __init__(self, layer_idx: int, config: Any, moe_config: MoEConfig, backend: BackendConfig): + super().__init__() + self.self_attn = HYV3Attention(config, backend) + + # Layers 0..(first_k_dense_replace-1) are dense; the rest are MoE. + first_k_dense = getattr(config, "first_k_dense_replace", 1) + if layer_idx < first_k_dense: + self.mlp = MLP(config.hidden_size, config.intermediate_size, backend.linear) + else: + self.mlp = MoE(moe_config, backend) + + self.input_layernorm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = initialize_rms_norm_module( + backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps + ) + self.layer_idx = layer_idx + + def forward( + self, + x: torch.Tensor, + *, + freqs_cis: torch.Tensor, + attention_mask: torch.Tensor | None = None, + padding_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if attention_mask is not None and padding_mask is None: + padding_mask = attention_mask.bool().logical_not() + + attn_out = self.self_attn( + x=self.input_layernorm(x), + freqs_cis=freqs_cis, + attention_mask=attention_mask, + **attn_kwargs, + ) + x = x + attn_out + + mlp_out = self._mlp(x=self.post_attention_layernorm(x), padding_mask=padding_mask) + x = x + mlp_out + return x + + def _mlp(self, x: torch.Tensor, padding_mask: torch.Tensor | None) -> torch.Tensor: + if isinstance(self.mlp, MLP): + return self.mlp(x) + assert isinstance(self.mlp, MoE) + return self.mlp(x, padding_mask) + + def init_weights(self, buffer_device: torch.device): + for norm in (self.input_layernorm, self.post_attention_layernorm): + norm.reset_parameters() + self.self_attn.init_weights(buffer_device) + self.mlp.init_weights(buffer_device) + + +class HYV3Model(nn.Module): + def __init__( + self, + config: Any, + backend: BackendConfig, + *, + moe_config: MoEConfig | None = None, + moe_overrides: dict | None = None, + ): + super().__init__() + self.backend = backend + self.config = config + if moe_config is not None and moe_overrides is not None: + raise ValueError("Cannot pass both moe_config and moe_overrides.") + + moe_defaults = dict( + dim=config.hidden_size, + inter_dim=config.intermediate_size, + moe_inter_dim=config.moe_intermediate_size, + n_routed_experts=config.num_experts, + n_shared_experts=getattr(config, "num_shared_experts", 0), + n_activated_experts=config.num_experts_per_tok, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + # Expert-bias correction: updated every backward pass at rate 1e-3 + gate_bias_update_factor=1e-3, + score_func="sigmoid", + route_scale=getattr(config, "router_scaling_factor", 1.0), + aux_loss_coeff=0.0, + norm_topk_prob=getattr(config, "route_norm", False), + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + # Ensures e_score_correction_bias buffer is created so HF checkpoints load cleanly + force_e_score_correction_bias=getattr(config, "moe_router_enable_expert_bias", False), + ) + if moe_overrides: + moe_defaults.update(moe_overrides) + self.moe_config = moe_config or MoEConfig(**moe_defaults) + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, dtype=get_dtype(config.torch_dtype, torch.bfloat16) + ) + self.layers = torch.nn.ModuleDict() + for layer_id in range(config.num_hidden_layers): + self.layers[str(layer_id)] = Block(layer_id, config, self.moe_config, backend) + self.norm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps) + + self.max_seq_len = config.max_position_embeddings + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + + base, rope_scaling, _ = get_rope_config(config) + + self.rotary_emb = RotaryEmbedding( + head_dim=self.head_dim, + base=base, + dtype=torch.float32, + initial_context_length=rope_scaling.get("original_max_position_embeddings", 4096), + scaling_factor=rope_scaling.get("factor", 1.0), + ntk_alpha=rope_scaling.get("beta_slow", 1.0), + ntk_beta=rope_scaling.get("beta_fast", 32.0), + device=torch.device(f"cuda:{torch.cuda.current_device()}"), + ) + + def forward( + self, + input_ids: torch.Tensor, + *, + position_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + padding_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if position_ids is None: + position_ids = ( + torch.arange(0, input_ids.shape[1], device=input_ids.device).unsqueeze(0).expand(input_ids.shape[0], -1) + ) + + freqs_cis = position_ids_to_freqs_cis( + self.rotary_emb, + position_ids, + qkv_format=attn_kwargs.get("qkv_format", "bshd"), + for_fused_rope=self.backend.rope_fusion, + cp_size=attn_kwargs.get("cp_size", 1), + ) + + h = self.embed_tokens(input_ids) if self.embed_tokens is not None else input_ids + + for layer in self.layers.values(): + h = layer( + x=h, + freqs_cis=freqs_cis, + attention_mask=attention_mask, + padding_mask=padding_mask, + **attn_kwargs, + ) + + h = self.norm(h) if self.norm else h + return h + + @torch.no_grad() + def init_weights(self, buffer_device: torch.device | None = None) -> None: + buffer_device = buffer_device or torch.device(f"cuda:{torch.cuda.current_device()}") + with buffer_device: + if self.embed_tokens is not None: + nn.init.normal_(self.embed_tokens.weight) + if self.norm is not None: + self.norm.reset_parameters() + self.rotary_emb.device = buffer_device + + for layer in self.layers.values(): + if layer is not None: + layer.init_weights(buffer_device=buffer_device) + + +class HYV3ForCausalLM(HFCheckpointingMixin, nn.Module, MoEFSDPSyncMixin): + @classmethod + def from_config( + cls, + config: Any, + moe_config: MoEConfig | None = None, + backend: BackendConfig | None = None, + **kwargs, + ): + return cls(config, moe_config, backend, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + *model_args, + **kwargs, + ): + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=False) + return cls.from_config(config, *model_args, **kwargs) + + def __init__( + self, + config: Any, + moe_config: MoEConfig | None = None, + backend: BackendConfig | None = None, + **kwargs, + ): + super().__init__() + self.config = config + self.backend = backend or BackendConfig() + moe_overrides = kwargs.pop("moe_overrides", None) + self.model = HYV3Model(config, backend=self.backend, moe_config=moe_config, moe_overrides=moe_overrides) + self.lm_head = initialize_linear_module(self.backend.linear, config.hidden_size, config.vocab_size, bias=False) + if self.backend.enable_hf_state_dict_adapter: + self.state_dict_adapter = HYV3StateDictAdapter( + self.config, + self.model.moe_config, + self.backend, + dtype=get_dtype(config.torch_dtype, torch.bfloat16), + ) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def forward( + self, + input_ids: torch.Tensor, + *, + position_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + padding_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if "qkv_format" in attn_kwargs and attn_kwargs["qkv_format"] == "thd": + input_ids, position_ids, padding_mask, attn_kwargs = squeeze_input_for_thd( + input_ids, position_ids, padding_mask, attn_kwargs + ) + attention_mask = None + + hidden = self.model( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + padding_mask=padding_mask, + **attn_kwargs, + ) + logits = self.lm_head(hidden) if self.lm_head else hidden + if "qkv_format" in attn_kwargs and attn_kwargs["qkv_format"] == "thd": + logits = logits.unsqueeze(0) + return logits + + @torch.no_grad() + def initialize_weights( + self, buffer_device: torch.device | None = None, dtype: torch.dtype = torch.bfloat16 + ) -> None: + buffer_device = buffer_device or torch.device(f"cuda:{torch.cuda.current_device()}") + with buffer_device: + self.model.init_weights(buffer_device=buffer_device) + final_out_std = self.config.hidden_size**-0.5 + cutoff_factor = 3 + if self.lm_head is not None: + nn.init.trunc_normal_( + self.lm_head.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + + cast_model_to_dtype(self, dtype) + with buffer_device: + self.model.rotary_emb.device = buffer_device + + +ModelClass = HYV3ForCausalLM diff --git a/nemo_automodel/components/models/hy_v3/state_dict_adapter.py b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py new file mode 100644 index 000000000..6e7f2d191 --- /dev/null +++ b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py @@ -0,0 +1,205 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""State dict conversion between HuggingFace HYV3 format and Automodel native format. + +Key differences between HF and native formats: + +HF format (tencent/Hy3-preview): + model.layers.{L}.mlp.experts.gate_up_proj # [n_experts, 2*moe_inter, hidden] + model.layers.{L}.mlp.experts.down_proj # [n_experts, hidden, moe_inter] + model.layers.{L}.mlp.e_score_correction_bias # [n_experts] (on MoE module) + model.layers.{L}.mlp.gate.weight # [n_experts, hidden] + model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight # separate MLPs + +Native format (Automodel GroupedExperts): + model.layers.{L}.mlp.experts.gate_and_up_projs # [n_local, hidden, 2*moe_inter] (transposed) + model.layers.{L}.mlp.experts.down_projs # [n_local, moe_inter, hidden] (transposed) + model.layers.{L}.mlp.gate.e_score_correction_bias # [n_local] (moved to Gate module) + model.layers.{L}.mlp.gate.weight # [n_experts, hidden] (unchanged) + model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight # unchanged + +All other keys (attention, norms, embeddings, lm_head) are identical between formats. +Expert parallelism: in from_hf each rank loads only its slice [start:end] of the +first (expert) dimension of gate_up_proj / down_proj. +""" + +import logging +import re +from typing import Any, Optional + +import torch +from torch.distributed.device_mesh import DeviceMesh + +from nemo_automodel.components.checkpoint.state_dict_adapter import StateDictAdapter +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.moe.config import MoEConfig +from nemo_automodel.components.moe.state_dict_utils import get_expert_range_for_rank_from_mesh + +logger = logging.getLogger(__name__) + +# Regex patterns used for key detection +_EXPERTS_GATE_UP = re.compile(r"(.*\.mlp)\.experts\.gate_up_proj$") +_EXPERTS_DOWN = re.compile(r"(.*\.mlp)\.experts\.down_proj$") +_E_SCORE_BIAS_HF = re.compile(r"(.*\.mlp)\.e_score_correction_bias$") + +_EXPERTS_GATE_AND_UP = re.compile(r"(.*\.mlp)\.experts\.gate_and_up_projs$") +_EXPERTS_DOWN_NATIVE = re.compile(r"(.*\.mlp)\.experts\.down_projs$") +_E_SCORE_BIAS_NATIVE = re.compile(r"(.*\.mlp)\.gate\.e_score_correction_bias$") + + +class HYV3StateDictAdapter(StateDictAdapter): + """Converts between HF HYV3 checkpoints and Automodel native format. + + The HF format uses pre-grouped (batched) expert tensors but with the last + two dimensions transposed relative to Automodel's GroupedExperts convention. + The e_score_correction_bias lives on the MoE module in HF but on the Gate + sub-module in native format. + """ + + def __init__( + self, + config: Any, + moe_config: MoEConfig, + backend: BackendConfig, + dtype: torch.dtype = torch.bfloat16, + ): + self.config = config + self.moe_config = moe_config + self.backend = backend + self.dtype = dtype + self._uses_model_prefix = True + + # ------------------------------------------------------------------ + # HF → native + # ------------------------------------------------------------------ + + def from_hf( + self, + hf_state_dict: dict[str, Any], + device_mesh: Optional[DeviceMesh] = None, + **kwargs, + ) -> dict[str, Any]: + """Convert HF checkpoint to native Automodel format. + + For expert parallelism, only the slice owned by the current rank is + retained from the batched expert tensors. + """ + n_experts = self.moe_config.n_routed_experts + + if device_mesh is not None: + start_expert, end_expert = get_expert_range_for_rank_from_mesh(device_mesh, n_experts) + else: + start_expert, end_expert = 0, n_experts + + native: dict[str, Any] = {} + + for key, value in hf_state_dict.items(): + # Skip MTP (multi-token prediction) layers — not used during SFT. + # These appear as layers beyond num_hidden_layers in the checkpoint. + if self._is_mtp_key(key): + continue + + m = _EXPERTS_GATE_UP.match(key) + if m: + # HF: [n_experts, 2*moe_inter, hidden] → native: [n_local, hidden, 2*moe_inter] + sliced = value[start_expert:end_expert] + native_key = m.group(1) + ".experts.gate_and_up_projs" + native[native_key] = sliced.transpose(1, 2).contiguous() + continue + + m = _EXPERTS_DOWN.match(key) + if m: + # HF: [n_experts, hidden, moe_inter] → native: [n_local, moe_inter, hidden] + sliced = value[start_expert:end_expert] + native_key = m.group(1) + ".experts.down_projs" + native[native_key] = sliced.transpose(1, 2).contiguous() + continue + + m = _E_SCORE_BIAS_HF.match(key) + if m: + # Move from MoE module level to Gate sub-module. + native_key = m.group(1) + ".gate.e_score_correction_bias" + native[native_key] = value + continue + + # All other keys (attn, norms, dense mlp, shared experts, lm_head) pass through. + native[key] = value + + return native + + # ------------------------------------------------------------------ + # native → HF + # ------------------------------------------------------------------ + + def to_hf( + self, + state_dict: dict[str, Any], + exclude_key_regex: Optional[str] = None, + **kwargs, + ) -> dict[str, Any]: + """Convert native Automodel state dict back to HF format.""" + hf: dict[str, Any] = {} + + for key, value in state_dict.items(): + converted_pairs = self.convert_single_tensor_to_hf(key, value, exclude_key_regex=exclude_key_regex) + for k, v in converted_pairs: + hf[k] = v + + return hf + + def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]: + """Convert a single native tensor to its HF equivalent.""" + exclude_key_regex = kwargs.get("exclude_key_regex", None) + + m = _EXPERTS_GATE_AND_UP.match(fqn) + if m: + # native: [n_local, hidden, 2*moe_inter] → HF: [n_local, 2*moe_inter, hidden] + hf_key = m.group(1) + ".experts.gate_up_proj" + result = [(hf_key, tensor.transpose(1, 2).contiguous())] + else: + m = _EXPERTS_DOWN_NATIVE.match(fqn) + if m: + # native: [n_local, moe_inter, hidden] → HF: [n_local, hidden, moe_inter] + hf_key = m.group(1) + ".experts.down_proj" + result = [(hf_key, tensor.transpose(1, 2).contiguous())] + else: + m = _E_SCORE_BIAS_NATIVE.match(fqn) + if m: + hf_key = m.group(1) + ".e_score_correction_bias" + result = [(hf_key, tensor)] + else: + result = [(fqn, tensor)] + + if exclude_key_regex: + result = [(k, v) for k, v in result if not re.match(exclude_key_regex, k)] + + return result + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _is_mtp_key(self, key: str) -> bool: + """Return True if key belongs to a multi-token prediction (MTP) layer. + + HYV3 checkpoints include num_nextn_predict_layers extra layers beyond + the main num_hidden_layers. These are stored at layer indices >= num_hidden_layers + and are not used during standard SFT. + """ + num_hidden = getattr(self.config, "num_hidden_layers", 80) + m = re.match(r"(?:model\.)?layers\.(\d+)\.", key) + if m and int(m.group(1)) >= num_hidden: + return True + return False From e1a1056753e51baa6ecfe11efba4c02d556461b4 Mon Sep 17 00:00:00 2001 From: khazic Date: Mon, 27 Apr 2026 17:05:37 +0800 Subject: [PATCH 02/27] ci(llm): add HYV3 phased test configs (P0/P1/P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 (hy3_4layer_p0_smoke.yaml): 4-layer proxy, pp=2, ep=4, torch dispatcher, 100 steps — validates forward/backward/PP/EP health and e_score_correction_bias updates with no checkpoint I/O. P1 (hy3_4layer_p1_ckpt.yaml): same topology + DCP checkpoint save at step 50, exercises save/resume continuity of the full FSDP2+EP state including Gate buffers. P2 (hy3_8layer_p2_deepep.yaml): 8-layer proxy, pp=2, ep=4, DeepEP dispatcher (async_finish=True), 200 steps — validates deepep communicate-compute overlap and throughput vs P0 torch baseline. All three configs: pp=2, ep=4, 8 GPUs, interleaved1f1b schedule, real sigmoid routing (fake_balanced_gate: false). Signed-off-by: khazic --- .../hy_v3/hy3_4layer_p0_smoke.yaml | 169 +++++++++++++++++ .../hy_v3/hy3_4layer_p1_ckpt.yaml | 172 +++++++++++++++++ .../hy_v3/hy3_8layer_p2_deepep.yaml | 173 ++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100644 examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml create mode 100644 examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml create mode 100644 examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml new file mode 100644 index 000000000..1c23257e2 --- /dev/null +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml @@ -0,0 +1,169 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# P0 — End-to-end forward/backward/PP/EP health check for HYV3ForCausalLM. +# +# Proxy model (4 layers, tiny dims) — no checkpoint download required. +# Tests: loss decreases, no NaN/Inf, PP pipeline works, EP dispatch works, +# FSDP2 sharding correct, e_score_correction_bias buffer updates. +# +# Hardware: 8× GPU (A100 80GB or equivalent) +# automodel examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml --nproc-per-node 8 +# +# Expected outcomes: +# - Training starts within 2 min +# - Loss at step 100 < loss at step 1 +# - No NaN / Inf in loss or gradients +# - Memory stable at every step (no OOM growth) +# - Validation runs at step 2 without error + +recipe: TrainFinetuneRecipeForNextTokenPrediction + +step_scheduler: + global_batch_size: 16 + local_batch_size: 4 + ckpt_every_steps: 999999 # disable checkpoint for P0 + val_every_steps: 2 + max_steps: 100 + +dist_env: + backend: nccl + timeout_minutes: 10 + +rng: + _target_: nemo_automodel.components.training.rng.StatefulRNG + seed: 1234 + ranked: true + +model: + _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config + config: + _target_: transformers.PretrainedConfig + architectures: [HYV3ForCausalLM] + model_type: hy_v3 + # --- proxy dimensions (tiny, same structure as real HYV3) --- + vocab_size: 120832 + hidden_size: 2048 + num_hidden_layers: 4 # 1 dense (layer 0) + 3 MoE + num_attention_heads: 16 + num_key_value_heads: 4 + head_dim: 128 + intermediate_size: 4096 # dense layer 0 FFN width + moe_intermediate_size: 512 # MoE expert hidden + max_position_embeddings: 2048 + hidden_act: silu + rms_norm_eps: 1.0e-5 + rope_parameters: + rope_theta: 11158840.0 + rope_type: default + # --- MoE topology --- + num_experts: 8 # 8 / ep_size=4 = 2 experts per rank + num_experts_per_tok: 2 + num_shared_experts: 1 + first_k_dense_replace: 1 # layer 0 is dense, layers 1-3 are MoE + # --- HYV3-specific routing --- + moe_router_use_sigmoid: true + moe_router_enable_expert_bias: true + route_norm: true + router_scaling_factor: 2.826 + qk_norm: true + initializer_range: 0.02 + bos_token_id: 120000 + eos_token_id: 120025 + pad_token_id: 120002 + use_cache: false + torch_dtype: bfloat16 + trust_remote_code: false + backend: + _target_: nemo_automodel.components.models.common.BackendConfig + attn: sdpa + linear: torch + rms_norm: torch_fp32 + experts: torch_mm + dispatcher: torch # use torch dispatcher; upgrade to deepep for P2+ + fake_balanced_gate: false # exercise real sigmoid routing + enable_hf_state_dict_adapter: false + enable_fsdp_optimizations: true + +checkpoint: + enabled: false + checkpoint_dir: /tmp/checkpoints/hy3_p0/ + +distributed: + strategy: fsdp2 + tp_size: 1 + cp_size: 1 + pp_size: 2 # 2 pipeline stages + ep_size: 4 # 8 experts / 4 ranks = 2 experts per rank + # total GPUs = pp * ep = 2 * 4 = 8 + + sequence_parallel: false + activation_checkpointing: true + + pipeline: + pp_schedule: interleaved1f1b + pp_microbatch_size: 1 + # 4 layers / layers_per_stage=1 = 4 virtual stages + # 4 virtual stages / pp_size=2 = 2 stages per rank ✓ (interleaved requires >=2) + layers_per_stage: 1 + round_virtual_stages_to_pp_multiple: down + scale_grads_in_schedule: false + patch_inner_model: false + patch_causal_lm_model: false + + moe: + reshard_after_forward: false + wrap_outer_model: false + +loss_fn: + _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy + +dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: train + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +packed_sequence: + packed_sequence_size: 0 + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + shuffle: true + +validation_dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: validation + num_samples_limit: 32 + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + batch_size: 4 + +optimizer: + _target_: torch.optim.Adam + betas: [0.9, 0.95] + eps: 1.0e-7 + lr: 5.0e-4 + weight_decay: 0.0 + +ci: + recipe_owner: khazic + time: "00:30:00" diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml new file mode 100644 index 000000000..349a9ab29 --- /dev/null +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml @@ -0,0 +1,172 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# P1 — Checkpoint save / resume health check for HYV3ForCausalLM. +# +# Proxy model (4 layers, same topology as P0) — no checkpoint download required. +# Extends P0 by enabling DCP checkpoint writing and validating that resume +# from mid-training produces an identical loss trajectory. +# +# Procedure: +# 1. Run for 100 steps; a checkpoint is saved at step 50. +# 2. Kill the job, re-run with --resume pointing to the saved checkpoint. +# 3. Confirm loss at step 51 (post-resume) matches the reference log. +# +# Hardware: 8× GPU (A100 80GB or equivalent) +# automodel examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml --nproc-per-node 8 +# +# Expected outcomes: +# - Checkpoint directory contains epoch_0_step_50/ +# - Loss at step 51 (resumed) ≡ loss at step 51 (original run) ± 1e-4 +# - No NaN / Inf in loss or gradients after resume +# - e_score_correction_bias restored correctly (verify via logged norm) + +recipe: TrainFinetuneRecipeForNextTokenPrediction + +step_scheduler: + global_batch_size: 16 + local_batch_size: 4 + ckpt_every_steps: 50 + val_every_steps: 10 + max_steps: 100 + +dist_env: + backend: nccl + timeout_minutes: 15 + +rng: + _target_: nemo_automodel.components.training.rng.StatefulRNG + seed: 1234 + ranked: true + +model: + _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config + config: + _target_: transformers.PretrainedConfig + architectures: [HYV3ForCausalLM] + model_type: hy_v3 + # --- proxy dimensions (identical to P0) --- + vocab_size: 120832 + hidden_size: 2048 + num_hidden_layers: 4 # 1 dense (layer 0) + 3 MoE + num_attention_heads: 16 + num_key_value_heads: 4 + head_dim: 128 + intermediate_size: 4096 + moe_intermediate_size: 512 + max_position_embeddings: 2048 + hidden_act: silu + rms_norm_eps: 1.0e-5 + rope_parameters: + rope_theta: 11158840.0 + rope_type: default + # --- MoE topology --- + num_experts: 8 + num_experts_per_tok: 2 + num_shared_experts: 1 + first_k_dense_replace: 1 + # --- HYV3-specific routing --- + moe_router_use_sigmoid: true + moe_router_enable_expert_bias: true + route_norm: true + router_scaling_factor: 2.826 + qk_norm: true + initializer_range: 0.02 + bos_token_id: 120000 + eos_token_id: 120025 + pad_token_id: 120002 + use_cache: false + torch_dtype: bfloat16 + trust_remote_code: false + backend: + _target_: nemo_automodel.components.models.common.BackendConfig + attn: sdpa + linear: torch + rms_norm: torch_fp32 + experts: torch_mm + dispatcher: torch + fake_balanced_gate: false + enable_hf_state_dict_adapter: false + enable_fsdp_optimizations: true + +checkpoint: + enabled: true + checkpoint_dir: /tmp/checkpoints/hy3_p1/ + model_save_format: safetensors + save_consolidated: false + +distributed: + strategy: fsdp2 + tp_size: 1 + cp_size: 1 + pp_size: 2 + ep_size: 4 + + sequence_parallel: false + activation_checkpointing: true + + pipeline: + pp_schedule: interleaved1f1b + pp_microbatch_size: 1 + layers_per_stage: 1 + round_virtual_stages_to_pp_multiple: down + scale_grads_in_schedule: false + patch_inner_model: false + patch_causal_lm_model: false + + moe: + reshard_after_forward: false + wrap_outer_model: false + +loss_fn: + _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy + +dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: train + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +packed_sequence: + packed_sequence_size: 0 + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + shuffle: true + +validation_dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: validation + num_samples_limit: 32 + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + batch_size: 4 + +optimizer: + _target_: torch.optim.Adam + betas: [0.9, 0.95] + eps: 1.0e-7 + lr: 5.0e-4 + weight_decay: 0.0 + +ci: + recipe_owner: khazic + time: "00:30:00" diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml new file mode 100644 index 000000000..6418dad03 --- /dev/null +++ b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml @@ -0,0 +1,173 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# P2 — DeepEP dispatcher + larger proxy (8 layers) scale-up test for HYV3ForCausalLM. +# +# Proxy model (8 layers, same dims as P0/P1) — no checkpoint download required. +# Replaces the torch dispatcher with deepep to validate the full EP communication +# path (async_finish=True, communicate-compute overlap). +# +# Layer layout: +# 8 layers / layers_per_stage=2 → 4 virtual stages +# 4 virtual stages / pp_size=2 → 2 stages per rank ✓ (interleaved requires >=2) +# total GPUs = pp(2) × ep(4) = 8 +# +# Hardware: 8× GPU (A100 80GB or equivalent) with NVLINK / NVSwitch for DeepEP +# automodel examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml --nproc-per-node 8 +# +# Expected outcomes: +# - Training starts within 3 min (DeepEP kernel compilation on first step) +# - Loss at step 200 < loss at step 1 +# - No NaN / Inf in loss or gradients +# - Throughput (tokens/sec) measurably higher than P0 torch dispatcher run +# - Memory stable at every step (no OOM growth) + +recipe: TrainFinetuneRecipeForNextTokenPrediction + +step_scheduler: + global_batch_size: 16 + local_batch_size: 4 + ckpt_every_steps: 999999 # disable checkpoint for P2 + val_every_steps: 20 + max_steps: 200 + +dist_env: + backend: nccl + timeout_minutes: 20 + +rng: + _target_: nemo_automodel.components.training.rng.StatefulRNG + seed: 1234 + ranked: true + +model: + _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config + config: + _target_: transformers.PretrainedConfig + architectures: [HYV3ForCausalLM] + model_type: hy_v3 + # --- proxy dimensions (same as P0, but 8 layers) --- + vocab_size: 120832 + hidden_size: 2048 + num_hidden_layers: 8 # 1 dense (layer 0) + 7 MoE + num_attention_heads: 16 + num_key_value_heads: 4 + head_dim: 128 + intermediate_size: 4096 + moe_intermediate_size: 512 + max_position_embeddings: 2048 + hidden_act: silu + rms_norm_eps: 1.0e-5 + rope_parameters: + rope_theta: 11158840.0 + rope_type: default + # --- MoE topology --- + num_experts: 8 # 8 / ep_size=4 = 2 experts per rank + num_experts_per_tok: 2 + num_shared_experts: 1 + first_k_dense_replace: 1 + # --- HYV3-specific routing --- + moe_router_use_sigmoid: true + moe_router_enable_expert_bias: true + route_norm: true + router_scaling_factor: 2.826 + qk_norm: true + initializer_range: 0.02 + bos_token_id: 120000 + eos_token_id: 120025 + pad_token_id: 120002 + use_cache: false + torch_dtype: bfloat16 + trust_remote_code: false + backend: + _target_: nemo_automodel.components.models.common.BackendConfig + attn: sdpa + linear: torch + rms_norm: torch_fp32 + experts: torch_mm + dispatcher: deepep # tests full DeepEP communication path + fake_balanced_gate: false # exercise real sigmoid routing + enable_hf_state_dict_adapter: false + enable_fsdp_optimizations: true + +checkpoint: + enabled: false + checkpoint_dir: /tmp/checkpoints/hy3_p2/ + +distributed: + strategy: fsdp2 + tp_size: 1 + cp_size: 1 + pp_size: 2 + ep_size: 4 + + sequence_parallel: false + activation_checkpointing: true + + pipeline: + pp_schedule: interleaved1f1b + pp_microbatch_size: 1 + # 8 layers / layers_per_stage=2 → 4 virtual stages + # 4 virtual stages / pp_size=2 → 2 stages per rank ✓ + layers_per_stage: 2 + round_virtual_stages_to_pp_multiple: down + scale_grads_in_schedule: false + patch_inner_model: false + patch_causal_lm_model: false + + moe: + reshard_after_forward: false + wrap_outer_model: false + +loss_fn: + _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy + +dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: train + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +packed_sequence: + packed_sequence_size: 0 + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + shuffle: true + +validation_dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: validation + num_samples_limit: 32 + tokenizer: + pretrained_model_name_or_path: tencent/Hy3-preview + +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: nemo_automodel.components.datasets.utils.default_collater + batch_size: 4 + +optimizer: + _target_: torch.optim.Adam + betas: [0.9, 0.95] + eps: 1.0e-7 + lr: 5.0e-4 + weight_decay: 0.0 + +ci: + recipe_owner: khazic + time: "00:45:00" From 51520dbf8d185f36d5c87ef32d6d9634e9205c0c Mon Sep 17 00:00:00 2001 From: khazic Date: Mon, 27 Apr 2026 17:29:06 +0800 Subject: [PATCH 03/27] ci(llm): rewrite HYV3 test configs to use real checkpoint (DSV4 pattern) Switch from tiny proxy model to real tencent/Hy3-preview weights with a truncated layer count (4/8 layers), following the same approach used for DeepSeek V4 Flash validation. Checkpoint keys for layers beyond the truncated num_hidden_layers are ignored via strict=False on load. P0 (4 layers, torch, pp=2 ep=4): validates real tensor shapes and routing P1 (4 layers, torch, pp=2 ep=4): adds DCP checkpoint save/resume P2 (8 layers, deepep, pp=2 ep=4): validates DeepEP with real expert dims All configs: AutoConfig.from_pretrained + num_hidden_layers override + load_base_model=true + enable_hf_state_dict_adapter=true. 192 experts / ep=4 = 48 experts per rank (~8GB params per rank at bf16). Signed-off-by: khazic --- .../hy_v3/hy3_4layer_p0_smoke.yaml | 83 +++++++----------- .../hy_v3/hy3_4layer_p1_ckpt.yaml | 74 +++++----------- .../hy_v3/hy3_8layer_p2_deepep.yaml | 86 ++++++------------- 3 files changed, 77 insertions(+), 166 deletions(-) diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml index 1c23257e2..bd5d406f5 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml @@ -14,15 +14,24 @@ # P0 — End-to-end forward/backward/PP/EP health check for HYV3ForCausalLM. # -# Proxy model (4 layers, tiny dims) — no checkpoint download required. -# Tests: loss decreases, no NaN/Inf, PP pipeline works, EP dispatch works, -# FSDP2 sharding correct, e_score_correction_bias buffer updates. +# Uses the real tencent/Hy3-preview config and weights, but only 4 layers. +# Layers 4-79 are never created; their checkpoint keys are ignored (strict=False). +# Tests real tensor shapes (hidden=4096, 192 experts) and real routing logic. +# +# Layer layout (4 layers, pp=2): +# Stage 0: embed + layer 0 (dense) + layer 1 (MoE) +# Stage 1: layer 2 (MoE) + layer 3 (MoE) + lm_head +# ep_size=4 → 192 / 4 = 48 experts per rank +# total GPUs = pp(2) × ep(4) = 8 # # Hardware: 8× GPU (A100 80GB or equivalent) # automodel examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml --nproc-per-node 8 # +# Before running: set HY3_PATH to your local Hy3-preview directory, then +# replace the two pretrained_model_name_or_path fields below accordingly. +# # Expected outcomes: -# - Training starts within 2 min +# - Training starts within 3 min (HF checkpoint loading for 4 layers) # - Loss at step 100 < loss at step 1 # - No NaN / Inf in loss or gradients # - Memory stable at every step (no OOM growth) @@ -31,15 +40,15 @@ recipe: TrainFinetuneRecipeForNextTokenPrediction step_scheduler: - global_batch_size: 16 - local_batch_size: 4 + global_batch_size: 8 + local_batch_size: 2 ckpt_every_steps: 999999 # disable checkpoint for P0 val_every_steps: 2 max_steps: 100 dist_env: backend: nccl - timeout_minutes: 10 + timeout_minutes: 30 rng: _target_: nemo_automodel.components.training.rng.StatefulRNG @@ -49,41 +58,11 @@ rng: model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: - _target_: transformers.PretrainedConfig - architectures: [HYV3ForCausalLM] - model_type: hy_v3 - # --- proxy dimensions (tiny, same structure as real HYV3) --- - vocab_size: 120832 - hidden_size: 2048 - num_hidden_layers: 4 # 1 dense (layer 0) + 3 MoE - num_attention_heads: 16 - num_key_value_heads: 4 - head_dim: 128 - intermediate_size: 4096 # dense layer 0 FFN width - moe_intermediate_size: 512 # MoE expert hidden - max_position_embeddings: 2048 - hidden_act: silu - rms_norm_eps: 1.0e-5 - rope_parameters: - rope_theta: 11158840.0 - rope_type: default - # --- MoE topology --- - num_experts: 8 # 8 / ep_size=4 = 2 experts per rank - num_experts_per_tok: 2 - num_shared_experts: 1 - first_k_dense_replace: 1 # layer 0 is dense, layers 1-3 are MoE - # --- HYV3-specific routing --- - moe_router_use_sigmoid: true - moe_router_enable_expert_bias: true - route_norm: true - router_scaling_factor: 2.826 - qk_norm: true - initializer_range: 0.02 - bos_token_id: 120000 - eos_token_id: 120025 - pad_token_id: 120002 - use_cache: false - torch_dtype: bfloat16 + _target_: transformers.AutoConfig.from_pretrained + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + num_hidden_layers: 4 # only build layers 0-3; keys for 4-79 ignored + num_nextn_predict_layers: 0 # disable MTP layer + load_base_model: true trust_remote_code: false backend: _target_: nemo_automodel.components.models.common.BackendConfig @@ -91,9 +70,9 @@ model: linear: torch rms_norm: torch_fp32 experts: torch_mm - dispatcher: torch # use torch dispatcher; upgrade to deepep for P2+ - fake_balanced_gate: false # exercise real sigmoid routing - enable_hf_state_dict_adapter: false + dispatcher: torch # torch dispatcher for P0 + fake_balanced_gate: false + enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true checkpoint: @@ -104,19 +83,15 @@ distributed: strategy: fsdp2 tp_size: 1 cp_size: 1 - pp_size: 2 # 2 pipeline stages - ep_size: 4 # 8 experts / 4 ranks = 2 experts per rank - # total GPUs = pp * ep = 2 * 4 = 8 + pp_size: 2 + ep_size: 4 # 192 / 4 = 48 experts per rank sequence_parallel: false activation_checkpointing: true pipeline: - pp_schedule: interleaved1f1b + pp_schedule: 1f1b pp_microbatch_size: 1 - # 4 layers / layers_per_stage=1 = 4 virtual stages - # 4 virtual stages / pp_size=2 = 2 stages per rank ✓ (interleaved requires >=2) - layers_per_stage: 1 round_virtual_stages_to_pp_multiple: down scale_grads_in_schedule: false patch_inner_model: false @@ -134,7 +109,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: packed_sequence_size: 0 @@ -150,7 +125,7 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml index 349a9ab29..1092ef5e1 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml @@ -14,36 +14,35 @@ # P1 — Checkpoint save / resume health check for HYV3ForCausalLM. # -# Proxy model (4 layers, same topology as P0) — no checkpoint download required. -# Extends P0 by enabling DCP checkpoint writing and validating that resume -# from mid-training produces an identical loss trajectory. +# Same 4-layer truncated setup as P0 (real weights, real dims) but with DCP +# checkpoint enabled. Tests that the full FSDP2+EP state — including the +# e_score_correction_bias Gate buffers — is saved and restored correctly. # # Procedure: -# 1. Run for 100 steps; a checkpoint is saved at step 50. -# 2. Kill the job, re-run with --resume pointing to the saved checkpoint. -# 3. Confirm loss at step 51 (post-resume) matches the reference log. +# 1. Run for 100 steps; a DCP checkpoint is saved at step 50. +# 2. Kill the job, then re-run with --resume pointing to the saved checkpoint. +# 3. Confirm loss at step 51 (post-resume) matches the original run ± 1e-4. # # Hardware: 8× GPU (A100 80GB or equivalent) # automodel examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml --nproc-per-node 8 # # Expected outcomes: -# - Checkpoint directory contains epoch_0_step_50/ -# - Loss at step 51 (resumed) ≡ loss at step 51 (original run) ± 1e-4 -# - No NaN / Inf in loss or gradients after resume -# - e_score_correction_bias restored correctly (verify via logged norm) +# - epoch_0_step_50/ directory present after Run 1 +# - Loss at step 51 (resumed) ≡ loss at step 51 (original) ± 1e-4 +# - e_score_correction_bias norm identical at step 51 across both runs recipe: TrainFinetuneRecipeForNextTokenPrediction step_scheduler: - global_batch_size: 16 - local_batch_size: 4 + global_batch_size: 8 + local_batch_size: 2 ckpt_every_steps: 50 val_every_steps: 10 max_steps: 100 dist_env: backend: nccl - timeout_minutes: 15 + timeout_minutes: 30 rng: _target_: nemo_automodel.components.training.rng.StatefulRNG @@ -53,41 +52,11 @@ rng: model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: - _target_: transformers.PretrainedConfig - architectures: [HYV3ForCausalLM] - model_type: hy_v3 - # --- proxy dimensions (identical to P0) --- - vocab_size: 120832 - hidden_size: 2048 - num_hidden_layers: 4 # 1 dense (layer 0) + 3 MoE - num_attention_heads: 16 - num_key_value_heads: 4 - head_dim: 128 - intermediate_size: 4096 - moe_intermediate_size: 512 - max_position_embeddings: 2048 - hidden_act: silu - rms_norm_eps: 1.0e-5 - rope_parameters: - rope_theta: 11158840.0 - rope_type: default - # --- MoE topology --- - num_experts: 8 - num_experts_per_tok: 2 - num_shared_experts: 1 - first_k_dense_replace: 1 - # --- HYV3-specific routing --- - moe_router_use_sigmoid: true - moe_router_enable_expert_bias: true - route_norm: true - router_scaling_factor: 2.826 - qk_norm: true - initializer_range: 0.02 - bos_token_id: 120000 - eos_token_id: 120025 - pad_token_id: 120002 - use_cache: false - torch_dtype: bfloat16 + _target_: transformers.AutoConfig.from_pretrained + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + num_hidden_layers: 4 + num_nextn_predict_layers: 0 + load_base_model: true trust_remote_code: false backend: _target_: nemo_automodel.components.models.common.BackendConfig @@ -97,7 +66,7 @@ model: experts: torch_mm dispatcher: torch fake_balanced_gate: false - enable_hf_state_dict_adapter: false + enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true checkpoint: @@ -117,9 +86,8 @@ distributed: activation_checkpointing: true pipeline: - pp_schedule: interleaved1f1b + pp_schedule: 1f1b pp_microbatch_size: 1 - layers_per_stage: 1 round_virtual_stages_to_pp_multiple: down scale_grads_in_schedule: false patch_inner_model: false @@ -137,7 +105,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: packed_sequence_size: 0 @@ -153,7 +121,7 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml index 6418dad03..fc16405b6 100644 --- a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml @@ -12,39 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. -# P2 — DeepEP dispatcher + larger proxy (8 layers) scale-up test for HYV3ForCausalLM. +# P2 — DeepEP dispatcher scale-up test for HYV3ForCausalLM. # -# Proxy model (8 layers, same dims as P0/P1) — no checkpoint download required. -# Replaces the torch dispatcher with deepep to validate the full EP communication -# path (async_finish=True, communicate-compute overlap). +# Uses the real tencent/Hy3-preview config and weights, 8 layers. +# Layers 8-79 are never created; their checkpoint keys are ignored (strict=False). +# Replaces the torch dispatcher with deepep to validate communicate-compute +# overlap (async_finish=True) with real expert tensor shapes. # -# Layer layout: -# 8 layers / layers_per_stage=2 → 4 virtual stages -# 4 virtual stages / pp_size=2 → 2 stages per rank ✓ (interleaved requires >=2) +# Layer layout (8 layers, pp=2): +# Stage 0: embed + layers 0-3 (1 dense + 3 MoE) +# Stage 1: layers 4-7 (all MoE) + lm_head +# ep_size=4 → 192 / 4 = 48 experts per rank # total GPUs = pp(2) × ep(4) = 8 # -# Hardware: 8× GPU (A100 80GB or equivalent) with NVLINK / NVSwitch for DeepEP +# Hardware: 8× GPU (A100 80GB or equivalent) with NVLink / NVSwitch for DeepEP # automodel examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml --nproc-per-node 8 # # Expected outcomes: -# - Training starts within 3 min (DeepEP kernel compilation on first step) +# - First step slow (DeepEP kernel JIT compile); subsequent steps fast # - Loss at step 200 < loss at step 1 # - No NaN / Inf in loss or gradients -# - Throughput (tokens/sec) measurably higher than P0 torch dispatcher run -# - Memory stable at every step (no OOM growth) +# - Throughput higher than P0 torch run after warmup recipe: TrainFinetuneRecipeForNextTokenPrediction step_scheduler: - global_batch_size: 16 - local_batch_size: 4 + global_batch_size: 8 + local_batch_size: 2 ckpt_every_steps: 999999 # disable checkpoint for P2 val_every_steps: 20 max_steps: 200 dist_env: backend: nccl - timeout_minutes: 20 + timeout_minutes: 45 rng: _target_: nemo_automodel.components.training.rng.StatefulRNG @@ -54,41 +55,11 @@ rng: model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: - _target_: transformers.PretrainedConfig - architectures: [HYV3ForCausalLM] - model_type: hy_v3 - # --- proxy dimensions (same as P0, but 8 layers) --- - vocab_size: 120832 - hidden_size: 2048 - num_hidden_layers: 8 # 1 dense (layer 0) + 7 MoE - num_attention_heads: 16 - num_key_value_heads: 4 - head_dim: 128 - intermediate_size: 4096 - moe_intermediate_size: 512 - max_position_embeddings: 2048 - hidden_act: silu - rms_norm_eps: 1.0e-5 - rope_parameters: - rope_theta: 11158840.0 - rope_type: default - # --- MoE topology --- - num_experts: 8 # 8 / ep_size=4 = 2 experts per rank - num_experts_per_tok: 2 - num_shared_experts: 1 - first_k_dense_replace: 1 - # --- HYV3-specific routing --- - moe_router_use_sigmoid: true - moe_router_enable_expert_bias: true - route_norm: true - router_scaling_factor: 2.826 - qk_norm: true - initializer_range: 0.02 - bos_token_id: 120000 - eos_token_id: 120025 - pad_token_id: 120002 - use_cache: false - torch_dtype: bfloat16 + _target_: transformers.AutoConfig.from_pretrained + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + num_hidden_layers: 8 # only build layers 0-7; keys for 8-79 ignored + num_nextn_predict_layers: 0 # disable MTP layer + load_base_model: true trust_remote_code: false backend: _target_: nemo_automodel.components.models.common.BackendConfig @@ -96,9 +67,9 @@ model: linear: torch rms_norm: torch_fp32 experts: torch_mm - dispatcher: deepep # tests full DeepEP communication path - fake_balanced_gate: false # exercise real sigmoid routing - enable_hf_state_dict_adapter: false + dispatcher: deepep # DeepEP for communicate-compute overlap + fake_balanced_gate: false + enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true checkpoint: @@ -110,17 +81,14 @@ distributed: tp_size: 1 cp_size: 1 pp_size: 2 - ep_size: 4 + ep_size: 4 # 192 / 4 = 48 experts per rank sequence_parallel: false activation_checkpointing: true pipeline: - pp_schedule: interleaved1f1b + pp_schedule: 1f1b pp_microbatch_size: 1 - # 8 layers / layers_per_stage=2 → 4 virtual stages - # 4 virtual stages / pp_size=2 → 2 stages per rank ✓ - layers_per_stage: 2 round_virtual_stages_to_pp_multiple: down scale_grads_in_schedule: false patch_inner_model: false @@ -138,7 +106,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: packed_sequence_size: 0 @@ -154,7 +122,7 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: - pretrained_model_name_or_path: tencent/Hy3-preview + pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader From 3e37aeb139a5749a75ef85ac8b41b28ca8ff66b7 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 10:21:39 +0800 Subject: [PATCH 04/27] fix(llm): align HYV3 configs and model with official Hy3-preview specs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix optimizer: Adam → AdamW, lr 5e-4 → 1e-5, eps 1e-7 → 1e-8 (follows official train.py and matches DSV4 pattern) - Add gate_precision: float32 to all HYV3 backends (matches HF router FP32) - Add rope_fusion: false to P0/P1/P2 (attn: sdpa; avoids TE mismatch) - Fix collate_fn to dict form with pad_seq_len_divisible: 64 - Add _target_ to tokenizer fields in dataset/validation_dataset - Add shuffle: false and drop_last: true to validation_dataloader - Fix hy3_preview_deepep: pp_schedule 1f1b (pp=1), add moe section, fix optimizer and dataset fields - Remove num_nextn_predict_layers (DeepSeek-specific, not in HYV3 config) - Add update_moe_gate_bias() to HYV3ForCausalLM so the training recipe updates e_score_correction_bias each optimizer step (load balancing) Signed-off-by: khazic --- .../hy_v3/hy3_4layer_p0_smoke.yaml | 22 ++++++++---- .../hy_v3/hy3_4layer_p1_ckpt.yaml | 22 ++++++++---- .../hy_v3/hy3_8layer_p2_deepep.yaml | 22 ++++++++---- .../hy_v3/hy3_preview_deepep.yaml | 36 +++++++++++++------ .../components/models/hy_v3/model.py | 6 ++++ 5 files changed, 77 insertions(+), 31 deletions(-) diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml index bd5d406f5..44971f8d2 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml @@ -61,7 +61,6 @@ model: _target_: transformers.AutoConfig.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path num_hidden_layers: 4 # only build layers 0-3; keys for 4-79 ignored - num_nextn_predict_layers: 0 # disable MTP layer load_base_model: true trust_remote_code: false backend: @@ -72,6 +71,8 @@ model: experts: torch_mm dispatcher: torch # torch dispatcher for P0 fake_balanced_gate: false + gate_precision: float32 + rope_fusion: false enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true @@ -109,6 +110,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: @@ -116,7 +118,9 @@ packed_sequence: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 shuffle: true validation_dataset: @@ -125,18 +129,22 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - batch_size: 4 + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: false + drop_last: true optimizer: - _target_: torch.optim.Adam + _target_: torch.optim.AdamW betas: [0.9, 0.95] - eps: 1.0e-7 - lr: 5.0e-4 + eps: 1e-8 + lr: 1e-5 weight_decay: 0.0 ci: diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml index 1092ef5e1..3752b0f30 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml @@ -55,7 +55,6 @@ model: _target_: transformers.AutoConfig.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path num_hidden_layers: 4 - num_nextn_predict_layers: 0 load_base_model: true trust_remote_code: false backend: @@ -66,6 +65,8 @@ model: experts: torch_mm dispatcher: torch fake_balanced_gate: false + gate_precision: float32 + rope_fusion: false enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true @@ -105,6 +106,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: @@ -112,7 +114,9 @@ packed_sequence: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 shuffle: true validation_dataset: @@ -121,18 +125,22 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - batch_size: 4 + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: false + drop_last: true optimizer: - _target_: torch.optim.Adam + _target_: torch.optim.AdamW betas: [0.9, 0.95] - eps: 1.0e-7 - lr: 5.0e-4 + eps: 1e-8 + lr: 1e-5 weight_decay: 0.0 ci: diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml index fc16405b6..8dc3bbf02 100644 --- a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml @@ -58,7 +58,6 @@ model: _target_: transformers.AutoConfig.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path num_hidden_layers: 8 # only build layers 0-7; keys for 8-79 ignored - num_nextn_predict_layers: 0 # disable MTP layer load_base_model: true trust_remote_code: false backend: @@ -69,6 +68,8 @@ model: experts: torch_mm dispatcher: deepep # DeepEP for communicate-compute overlap fake_balanced_gate: false + gate_precision: float32 + rope_fusion: false enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true @@ -106,6 +107,7 @@ dataset: path_or_dataset: rowan/hellaswag split: train tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path packed_sequence: @@ -113,7 +115,9 @@ packed_sequence: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 shuffle: true validation_dataset: @@ -122,18 +126,22 @@ validation_dataset: split: validation num_samples_limit: 32 tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - batch_size: 4 + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: false + drop_last: true optimizer: - _target_: torch.optim.Adam + _target_: torch.optim.AdamW betas: [0.9, 0.95] - eps: 1.0e-7 - lr: 5.0e-4 + eps: 1e-8 + lr: 1e-5 weight_decay: 0.0 ci: diff --git a/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml index 523a72b0d..66ecb18b2 100644 --- a/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml @@ -50,6 +50,7 @@ model: experts: torch_mm dispatcher: deepep fake_balanced_gate: false + gate_precision: float32 enable_hf_state_dict_adapter: true enable_fsdp_optimizations: true @@ -70,13 +71,17 @@ distributed: activation_checkpointing: true pipeline: - pp_schedule: interleaved1f1b - pp_microbatch_size: 4 - round_virtual_stages_to_pp_multiple: up + pp_schedule: 1f1b + pp_microbatch_size: 1 + round_virtual_stages_to_pp_multiple: down scale_grads_in_schedule: false patch_inner_model: false patch_causal_lm_model: false + moe: + reshard_after_forward: false + wrap_outer_model: false + loss_fn: _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy @@ -84,13 +89,18 @@ dataset: _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag path_or_dataset: rowan/hellaswag split: train + tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained + pretrained_model_name_or_path: tencent/Hy3-preview packed_sequence: packed_sequence_size: 0 dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 shuffle: true validation_dataset: @@ -98,18 +108,24 @@ validation_dataset: path_or_dataset: rowan/hellaswag split: validation num_samples_limit: 64 + tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained + pretrained_model_name_or_path: tencent/Hy3-preview validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - batch_size: 4 + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: false + drop_last: true optimizer: - _target_: torch.optim.Adam + _target_: torch.optim.AdamW betas: [0.9, 0.95] - eps: 1e-7 - lr: 1.0e-5 - weight_decay: 0.1 + eps: 1e-8 + lr: 1e-5 + weight_decay: 0.0 # Uncomment for W&B logging # wandb: diff --git a/nemo_automodel/components/models/hy_v3/model.py b/nemo_automodel/components/models/hy_v3/model.py index a8217ad5c..735cd5354 100644 --- a/nemo_automodel/components/models/hy_v3/model.py +++ b/nemo_automodel/components/models/hy_v3/model.py @@ -301,6 +301,12 @@ def forward( logits = logits.unsqueeze(0) return logits + def update_moe_gate_bias(self) -> None: + with torch.no_grad(): + for block in self.model.layers.values(): + if isinstance(block.mlp, MoE): + block.mlp.gate.update_bias() + @torch.no_grad() def initialize_weights( self, buffer_device: torch.device | None = None, dtype: torch.dtype = torch.bfloat16 From a555be506e70e630540350ca6db906f59b25e57e Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 13:40:20 +0800 Subject: [PATCH 05/27] ci(llm): set Hy3-preview checkpoint path to /llm-align/open_models/hunyuan3 Signed-off-by: khazic --- examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml | 6 +++--- examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml | 6 +++--- examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml index 44971f8d2..02b9ed32f 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml @@ -59,7 +59,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 num_hidden_layers: 4 # only build layers 0-3; keys for 4-79 ignored load_base_model: true trust_remote_code: false @@ -111,7 +111,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 packed_sequence: packed_sequence_size: 0 @@ -130,7 +130,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml index 3752b0f30..8f9a774fb 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml @@ -53,7 +53,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 num_hidden_layers: 4 load_base_model: true trust_remote_code: false @@ -107,7 +107,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 packed_sequence: packed_sequence_size: 0 @@ -126,7 +126,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml index 8dc3bbf02..2334eedb8 100644 --- a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml @@ -56,7 +56,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 num_hidden_layers: 8 # only build layers 0-7; keys for 8-79 ignored load_base_model: true trust_remote_code: false @@ -108,7 +108,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 packed_sequence: packed_sequence_size: 0 @@ -127,7 +127,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /path/to/Hy3-preview # <-- replace with local path + pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader From e54e6ae021640fa12e3dc4b99b548f082e035407 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 13:45:38 +0800 Subject: [PATCH 06/27] feat(llm): add HYV3Config and register hy_v3 with AutoConfig AutoConfig.from_pretrained failed on checkpoints with model_type=hy_v3 because the type was not registered. Add config.py with HYV3Config (PretrainedConfig subclass) and wire it into _CUSTOM_CONFIG_REGISTRATIONS so that trust_remote_code=False keeps working. Signed-off-by: khazic --- nemo_automodel/_transformers/registry.py | 1 + .../components/models/hy_v3/config.py | 100 ++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 nemo_automodel/components/models/hy_v3/config.py diff --git a/nemo_automodel/_transformers/registry.py b/nemo_automodel/_transformers/registry.py index bd025f354..b510a7f8b 100644 --- a/nemo_automodel/_transformers/registry.py +++ b/nemo_automodel/_transformers/registry.py @@ -178,6 +178,7 @@ _CUSTOM_CONFIG_REGISTRATIONS: Dict[str, Tuple[str, str]] = { "baichuan": ("nemo_automodel.components.models.baichuan.configuration", "BaichuanConfig"), "deepseek_v4": ("nemo_automodel.components.models.deepseek_v4.config", "DeepseekV4Config"), + "hy_v3": ("nemo_automodel.components.models.hy_v3.config", "HYV3Config"), "kimi_k25": ("nemo_automodel.components.models.kimi_k25_vl.model", "KimiK25VLConfig"), "kimi_vl": ("nemo_automodel.components.models.kimivl.model", "KimiVLConfig"), "llavaonevision1_5": ("nemo_automodel.components.models.llava_onevision.model", "Llavaonevision1_5Config"), diff --git a/nemo_automodel/components/models/hy_v3/config.py b/nemo_automodel/components/models/hy_v3/config.py new file mode 100644 index 000000000..479ce3a7f --- /dev/null +++ b/nemo_automodel/components/models/hy_v3/config.py @@ -0,0 +1,100 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from transformers import PretrainedConfig + + +class HYV3Config(PretrainedConfig): + """Configuration class for Tencent Hy3-preview (295B MoE). + + Architecture: + - 80 transformer layers; layer 0 is dense, layers 1-79 are MoE + - MoE: 192 routed experts + 1 shared expert, top-8 activated + - Sigmoid routing with expert-bias correction (e_score_correction_bias) + - GQA: 64 Q heads, 8 KV heads, head_dim=128 + - Per-head QK RMSNorm before RoPE + - 256K context, rope_theta=11158840 + """ + + model_type = "hy_v3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size: int = 129280, + hidden_size: int = 4096, + intermediate_size: int = 1536, + moe_intermediate_size: int = 1536, + num_hidden_layers: int = 80, + num_attention_heads: int = 64, + num_key_value_heads: int = 8, + head_dim: int = 128, + # MoE routing + num_experts: int = 192, + num_shared_experts: int = 1, + num_experts_per_tok: int = 8, + router_scaling_factor: float = 1.0, + route_norm: bool = False, + moe_router_enable_expert_bias: bool = True, + # Dense layers + first_k_dense_replace: int = 1, + # Position encoding + max_position_embeddings: int = 262144, + rope_theta: float = 11158840.0, + rope_scaling: dict | None = None, + # Standard options + rms_norm_eps: float = 1e-6, + attention_bias: bool = False, + hidden_act: str = "silu", + use_cache: bool = True, + pad_token_id: int | None = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + torch_dtype: str = "bfloat16", + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.head_dim = head_dim + self.num_experts = num_experts + self.num_shared_experts = num_shared_experts + self.num_experts_per_tok = num_experts_per_tok + self.router_scaling_factor = router_scaling_factor + self.route_norm = route_norm + self.moe_router_enable_expert_bias = moe_router_enable_expert_bias + self.first_k_dense_replace = first_k_dense_replace + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.rms_norm_eps = rms_norm_eps + self.attention_bias = attention_bias + self.hidden_act = hidden_act + self.torch_dtype = torch_dtype + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + use_cache=use_cache, + **kwargs, + ) From 14895dc6293e256b45718817a255ab6e189ae03f Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 13:52:10 +0800 Subject: [PATCH 07/27] fix(checkpoint): allow partial load when loading HF base checkpoint via DCP Custom models (e.g. HYV3) create training-only buffers (e_score_correction_bias) that are not present in the original HF pretrained checkpoint. Using DefaultLoadPlanner(allow_partial_load=True) for is_init_step loads lets those buffers keep their zero initialization instead of raising "Missing key in checkpoint state_dict". Signed-off-by: khazic --- nemo_automodel/components/checkpoint/checkpointing.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_automodel/components/checkpoint/checkpointing.py b/nemo_automodel/components/checkpoint/checkpointing.py index 5bc3cb8ad..bfc4e87e2 100644 --- a/nemo_automodel/components/checkpoint/checkpointing.py +++ b/nemo_automodel/components/checkpoint/checkpointing.py @@ -809,7 +809,13 @@ def _do_load( if self.config.is_peft and is_model and (not is_init_step): state_dict = load_file(os.path.join(path, "adapter_model.safetensors")) else: - dcp.load(state_dict, checkpoint_id=path, storage_reader=storage_reader) + if is_init_step and storage_reader is not None: + from torch.distributed.checkpoint import DefaultLoadPlanner + + planner = DefaultLoadPlanner(allow_partial_load=True) + else: + planner = None + dcp.load(state_dict, checkpoint_id=path, storage_reader=storage_reader, planner=planner) return state_dict def _do_save( From fafa76ea80a703dec6b545049934daf44dfbe320 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 14:01:45 +0800 Subject: [PATCH 08/27] fix(checkpoint): skip EP slicing in from_hf for standard DCP load path In the standard DCP load path (post-shard, e.g. PP+EP), DCP already distributes expert tensors correctly via DTensor placement (Shard(0)). Passing moe_mesh to from_hf causes a second EP slice on the DTensor, producing a plain tensor of local shape [48,...] that cannot be loaded into the model's DTensor parameter of global shape [192,...]. Fix: pass moe_mesh=None to _maybe_adapt_state_dict_from_hf in the standard DCP path so adapters only rename keys and do not re-slice tensors that DCP has already distributed. The fast path (lines 444-498) is unaffected: it still passes moe_mesh because it loads full plain tensors from disk and needs explicit slicing. Signed-off-by: khazic --- nemo_automodel/components/checkpoint/checkpointing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_automodel/components/checkpoint/checkpointing.py b/nemo_automodel/components/checkpoint/checkpointing.py index bfc4e87e2..24c92f11d 100644 --- a/nemo_automodel/components/checkpoint/checkpointing.py +++ b/nemo_automodel/components/checkpoint/checkpointing.py @@ -553,7 +553,10 @@ def load_model( if compat_tied_lm_head_source_key is not None and isinstance(lm_head_param_name, str): state_dict[lm_head_param_name] = state_dict.pop(compat_tied_lm_head_source_key) - state_dict = _maybe_adapt_state_dict_from_hf(model_state.model[0], state_dict, moe_mesh=self.moe_mesh) + # DCP already handled EP tensor distribution via DTensors, so don't + # pass moe_mesh here — adapter.from_hf must only rename keys, not + # re-slice experts that are already EP-local inside the DTensors. + state_dict = _maybe_adapt_state_dict_from_hf(model_state.model[0], state_dict, moe_mesh=None) key_diff = _summarize_state_dict_key_diff(expected_keys, set(state_dict.keys())) if key_diff["missing_count"] or key_diff["unexpected_count"]: logging.warning( From c662e727290e5d24667d83529a9a1790eed8632c Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 15:20:02 +0800 Subject: [PATCH 09/27] ci(llm): use public tencent/Hy3-preview HF path in HYV3 test yamls Replace internal server paths with the public HuggingFace model ID. Signed-off-by: khazic --- examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml | 6 +++--- examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml | 6 +++--- examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml index 02b9ed32f..5b37964b5 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml @@ -59,7 +59,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview num_hidden_layers: 4 # only build layers 0-3; keys for 4-79 ignored load_base_model: true trust_remote_code: false @@ -111,7 +111,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview packed_sequence: packed_sequence_size: 0 @@ -130,7 +130,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml index 8f9a774fb..e9235ef9c 100644 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml +++ b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml @@ -53,7 +53,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview num_hidden_layers: 4 load_base_model: true trust_remote_code: false @@ -107,7 +107,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview packed_sequence: packed_sequence_size: 0 @@ -126,7 +126,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml index 2334eedb8..91266e186 100644 --- a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml @@ -56,7 +56,7 @@ model: _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config config: _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview num_hidden_layers: 8 # only build layers 0-7; keys for 8-79 ignored load_base_model: true trust_remote_code: false @@ -108,7 +108,7 @@ dataset: split: train tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview packed_sequence: packed_sequence_size: 0 @@ -127,7 +127,7 @@ validation_dataset: num_samples_limit: 32 tokenizer: _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: /llm-align/open_models/hunyuan3 + pretrained_model_name_or_path: tencent/Hy3-preview validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader From 0c71b04b8bf4668566fe5e985536a25626dd0403 Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 15:21:25 +0800 Subject: [PATCH 10/27] ci(llm): remove P2 DeepEP yaml (not yet validated) Signed-off-by: khazic --- .../hy_v3/hy3_8layer_p2_deepep.yaml | 149 ------------------ 1 file changed, 149 deletions(-) delete mode 100644 examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml diff --git a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml deleted file mode 100644 index 91266e186..000000000 --- a/examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# P2 — DeepEP dispatcher scale-up test for HYV3ForCausalLM. -# -# Uses the real tencent/Hy3-preview config and weights, 8 layers. -# Layers 8-79 are never created; their checkpoint keys are ignored (strict=False). -# Replaces the torch dispatcher with deepep to validate communicate-compute -# overlap (async_finish=True) with real expert tensor shapes. -# -# Layer layout (8 layers, pp=2): -# Stage 0: embed + layers 0-3 (1 dense + 3 MoE) -# Stage 1: layers 4-7 (all MoE) + lm_head -# ep_size=4 → 192 / 4 = 48 experts per rank -# total GPUs = pp(2) × ep(4) = 8 -# -# Hardware: 8× GPU (A100 80GB or equivalent) with NVLink / NVSwitch for DeepEP -# automodel examples/llm_finetune/hy_v3/hy3_8layer_p2_deepep.yaml --nproc-per-node 8 -# -# Expected outcomes: -# - First step slow (DeepEP kernel JIT compile); subsequent steps fast -# - Loss at step 200 < loss at step 1 -# - No NaN / Inf in loss or gradients -# - Throughput higher than P0 torch run after warmup - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 8 - local_batch_size: 2 - ckpt_every_steps: 999999 # disable checkpoint for P2 - val_every_steps: 20 - max_steps: 200 - -dist_env: - backend: nccl - timeout_minutes: 45 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1234 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config - config: - _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - num_hidden_layers: 8 # only build layers 0-7; keys for 8-79 ignored - load_base_model: true - trust_remote_code: false - backend: - _target_: nemo_automodel.components.models.common.BackendConfig - attn: sdpa - linear: torch - rms_norm: torch_fp32 - experts: torch_mm - dispatcher: deepep # DeepEP for communicate-compute overlap - fake_balanced_gate: false - gate_precision: float32 - rope_fusion: false - enable_hf_state_dict_adapter: true - enable_fsdp_optimizations: true - -checkpoint: - enabled: false - checkpoint_dir: /tmp/checkpoints/hy3_p2/ - -distributed: - strategy: fsdp2 - tp_size: 1 - cp_size: 1 - pp_size: 2 - ep_size: 4 # 192 / 4 = 48 experts per rank - - sequence_parallel: false - activation_checkpointing: true - - pipeline: - pp_schedule: 1f1b - pp_microbatch_size: 1 - round_virtual_stages_to_pp_multiple: down - scale_grads_in_schedule: false - patch_inner_model: false - patch_causal_lm_model: false - - moe: - reshard_after_forward: false - wrap_outer_model: false - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: true - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: validation - num_samples_limit: 32 - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: false - drop_last: true - -optimizer: - _target_: torch.optim.AdamW - betas: [0.9, 0.95] - eps: 1e-8 - lr: 1e-5 - weight_decay: 0.0 - -ci: - recipe_owner: khazic - time: "00:45:00" From 665ddb5eebed61c9572ccac1af952d318327254d Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 16:27:25 +0800 Subject: [PATCH 11/27] fix(llm): disable e_score_correction_bias EMA update for HYV3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The official Hy3-preview fine-tuning scripts (train.py and hy_v3_patches.py) treat e_score_correction_bias as a static pre-trained buffer — it is loaded from the base checkpoint and used as-is during SFT, with no in-training update. Set gate_bias_update_factor=0.0 so our implementation matches: the buffer is still created and loaded from checkpoint (via force_e_score_correction_bias), but the EMA update path in Gate.update_bias() is never triggered. Signed-off-by: khazic --- nemo_automodel/components/models/hy_v3/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_automodel/components/models/hy_v3/model.py b/nemo_automodel/components/models/hy_v3/model.py index 735cd5354..9c549eb34 100644 --- a/nemo_automodel/components/models/hy_v3/model.py +++ b/nemo_automodel/components/models/hy_v3/model.py @@ -126,8 +126,7 @@ def __init__( n_expert_groups=0, n_limited_groups=0, train_gate=True, - # Expert-bias correction: updated every backward pass at rate 1e-3 - gate_bias_update_factor=1e-3, + gate_bias_update_factor=0.0, score_func="sigmoid", route_scale=getattr(config, "router_scaling_factor", 1.0), aux_loss_coeff=0.0, From 564ff4f2beaf86ed2d9fa5b59babef1e0800682b Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 16:50:07 +0800 Subject: [PATCH 12/27] fix(llm): guard update_moe_gate_bias against disabled bias update When gate_bias_update_factor=0.0, calling update_bias() triggers an assertion error. Skip the call when the factor is zero. Signed-off-by: khazic --- nemo_automodel/components/models/hy_v3/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_automodel/components/models/hy_v3/model.py b/nemo_automodel/components/models/hy_v3/model.py index 9c549eb34..09bc496f4 100644 --- a/nemo_automodel/components/models/hy_v3/model.py +++ b/nemo_automodel/components/models/hy_v3/model.py @@ -303,7 +303,7 @@ def forward( def update_moe_gate_bias(self) -> None: with torch.no_grad(): for block in self.model.layers.values(): - if isinstance(block.mlp, MoE): + if isinstance(block.mlp, MoE) and block.mlp.gate.bias_update_factor > 0: block.mlp.gate.update_bias() @torch.no_grad() From f631d36d94838ff68c8b2b37dede27fd137efe2c Mon Sep 17 00:00:00 2001 From: khazic Date: Tue, 28 Apr 2026 19:11:46 +0800 Subject: [PATCH 13/27] docs(llm): add model-coverage page for Hy3-preview (HYV3ForCausalLM) Fixes test_every_registered_arch_has_model_coverage_doc: the new HYV3ForCausalLM architecture registered in the registry now has a corresponding docs/model-coverage/llm/tencent/hy3.md page. Signed-off-by: khazic --- docs/model-coverage/llm/tencent/hy3.md | 65 ++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 docs/model-coverage/llm/tencent/hy3.md diff --git a/docs/model-coverage/llm/tencent/hy3.md b/docs/model-coverage/llm/tencent/hy3.md new file mode 100644 index 000000000..22b66d145 --- /dev/null +++ b/docs/model-coverage/llm/tencent/hy3.md @@ -0,0 +1,65 @@ +# Hy3 (HunyuanLarge) + +[Hy3-preview](https://huggingface.co/tencent/Hy3-preview) is a 295B Mixture-of-Experts language model from Tencent. It features 80 transformer layers (layer 0 dense, layers 1–79 MoE), 192 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (64 Q / 8 KV heads), per-head QK RMSNorm, RoPE, and an `e_score_correction_bias` gate buffer for expert-load correction. It supports a 256K context window. + +:::{card} +| | | +|---|---| +| **Task** | Text Generation (MoE) | +| **Architecture** | `HYV3ForCausalLM` | +| **Parameters** | 295B total | +| **HF Org** | [tencent](https://huggingface.co/tencent) | +::: + +## Available Models + +- **Hy3-preview**: 295B total, top-8 routed experts activated per token + +## Architectures + +- `HYV3ForCausalLM` + +## Example HF Models + +| Model | HF ID | +|---|---| +| Hy3-preview | [`tencent/Hy3-preview`](https://huggingface.co/tencent/Hy3-preview) | + +## Example Recipes + +| Recipe | Description | +|---|---| +| {download}`hy3_4layer_p0_smoke.yaml <../../../../examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml>` | SFT — Hy3 4-layer smoke test (P0) | +| {download}`hy3_4layer_p1_ckpt.yaml <../../../../examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml>` | SFT — Hy3 4-layer checkpoint test (P1) | +| {download}`hy3_preview_deepep.yaml <../../../../examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml>` | SFT — Hy3-preview with DeepEP | + +## Try with NeMo AutoModel + +**1. Install** ([full instructions](../../../guides/installation.md)): + +```bash +pip install nemo-automodel +``` + +**2. Clone the repo** to get the example recipes: + +```bash +git clone https://github.com/NVIDIA-NeMo/Automodel.git +cd Automodel +``` + +**3. Run the recipe** from inside the repo: + +```bash +automodel --nproc-per-node=8 examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml +``` + +See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). + +## Fine-Tuning + +See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md). + +## Hugging Face Model Cards + +- [tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview) From b7b01c829324a02f890388f83719ba366120043a Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 29 Apr 2026 00:16:53 +0800 Subject: [PATCH 14/27] docs(llm): add tencent/Hy3-preview to LLM model-coverage toctree hy3.md was missing from the toctree, causing a Sphinx build warning (treated as error by CI). Signed-off-by: khazic --- docs/model-coverage/llm/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 6122aaf10..2012a5913 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -71,6 +71,7 @@ NeMo AutoModel supports the [AutoModelForCausalLM](https://huggingface.co/transf | Stability AI | [StableLM](stabilityai/stablelm.md) | `StableLmForCausalLM` | | Stepfun AI | [Step-3.5](stepfun-ai/step-3-5.md) | `Step3p5ForCausalLM` | | Parasail AI | [GritLM](parasail-ai/gritlm.md) | `GritLM` | +| Tencent | [Hy3-preview](tencent/hy3.md) | `HYV3ForCausalLM` | ## Fine-Tuning LLMs with NeMo AutoModel @@ -140,4 +141,5 @@ orionstar/orion stabilityai/stablelm stepfun-ai/step-3-5 parasail-ai/gritlm +tencent/hy3 ``` From 4eb813304fc74cba679ed75c34bc6213050f80dd Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 12:00:01 -0700 Subject: [PATCH 15/27] chore(llm): tune Hy3-preview DeepEP recipe for 16-node run Bump local_batch_size 4->8, set pp_size=4 and ep_size=32 for 128 GPUs (16 nodes x 8), add max_steps=100, and raise val_every_steps to 500. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml index 66ecb18b2..b9ce4553b 100644 --- a/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml +++ b/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml @@ -24,10 +24,11 @@ recipe: TrainFinetuneRecipeForNextTokenPrediction step_scheduler: global_batch_size: 256 - local_batch_size: 4 + local_batch_size: 8 ckpt_every_steps: 500 - val_every_steps: 100 + val_every_steps: 500 num_epochs: 1 + max_steps: 100 dist_env: backend: nccl @@ -64,8 +65,8 @@ distributed: strategy: fsdp2 tp_size: 1 cp_size: 1 - pp_size: 1 - ep_size: 8 # 192 experts / 8 ranks = 24 experts per rank + pp_size: 4 + ep_size: 32 # 192 experts / 8 ranks = 24 experts per rank sequence_parallel: false activation_checkpointing: true From cb30a596f89502f00a94efdac151d2f24ecb3ac6 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 13:19:53 -0700 Subject: [PATCH 16/27] fix(llm): load Hy3-preview MoE expert weights from HF checkpoint The previous HYV3 adapter expected an already-fused HF state dict (mlp.experts.gate_up_proj / down_proj / e_score_correction_bias / shared_experts.* / mlp.gate.weight). The on-disk Tencent format actually stores per-expert split keys plus internal names (mlp.experts.{i}.{gate,up,down}_proj.weight, mlp.expert_bias, mlp.router.gate.weight, mlp.shared_mlp.*). Because checkpointing.py:507 zeroes reader_key_mapping when the model has a state_dict_adapter, the storage reader's renames never ran and DCP found no matching keys for any MoE tensor -- so router/shared/experts silently stayed at random init while everything else loaded fine. Rewrite HYV3StateDictAdapter on top of MoESplitExpertsStateDictMixin so to_hf produces on-disk-format keys (per-expert split + Tencent names) that DCP can match against the safetensors, and from_hf merges them back into the grouped native form. The three HYV3-specific renames (router.gate <-> gate, expert_bias <-> e_score_correction_bias, shared_mlp. <-> shared_experts.) are applied around the mixin. Also fix _maybe_adapt_state_dict_from_hf to pass moe_mesh in the DCP init path (was None). The mixin's validator/merger needs the EP mesh to know which expert-id subset is expected on the rank; without it, required_experts = range(192) and validation fails ("Expert weights missing from checkpoint: 432/576 ..."). The previous comment said "DCP already distributed -- don't pass moe_mesh", but the mesh is needed for subset-aware validation, not re-slicing. Verified end-to-end on hy3_4layer_p0_smoke (pp=2, ep=4, 8 GPUs): all 56 non-bias tensors are bitwise identical to the HF reference; the 3 e_score_correction_bias tensors agree to <=2.2e-4 (bf16 round-trip noise). Stitched mlp.gate.weight across 4 EP ranks matches the on-disk router.gate.weight bitwise. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- .../components/checkpoint/checkpointing.py | 10 +- .../models/hy_v3/state_dict_adapter.py | 280 ++++++++++-------- 2 files changed, 161 insertions(+), 129 deletions(-) diff --git a/nemo_automodel/components/checkpoint/checkpointing.py b/nemo_automodel/components/checkpoint/checkpointing.py index 24c92f11d..7b9225d8e 100644 --- a/nemo_automodel/components/checkpoint/checkpointing.py +++ b/nemo_automodel/components/checkpoint/checkpointing.py @@ -553,10 +553,12 @@ def load_model( if compat_tied_lm_head_source_key is not None and isinstance(lm_head_param_name, str): state_dict[lm_head_param_name] = state_dict.pop(compat_tied_lm_head_source_key) - # DCP already handled EP tensor distribution via DTensors, so don't - # pass moe_mesh here — adapter.from_hf must only rename keys, not - # re-slice experts that are already EP-local inside the DTensors. - state_dict = _maybe_adapt_state_dict_from_hf(model_state.model[0], state_dict, moe_mesh=None) + # Pass moe_mesh so adapter.from_hf can validate / merge against the + # rank-local subset of experts. DCP has already populated only the + # rank-local slice; the mixin's _from_hf_w_merged_experts iterates only + # what's in the dict (no re-slicing), but it needs the mesh to know + # which expert IDs are expected on this rank. + state_dict = _maybe_adapt_state_dict_from_hf(model_state.model[0], state_dict, moe_mesh=self.moe_mesh) key_diff = _summarize_state_dict_key_diff(expected_keys, set(state_dict.keys())) if key_diff["missing_count"] or key_diff["unexpected_count"]: logging.warning( diff --git a/nemo_automodel/components/models/hy_v3/state_dict_adapter.py b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py index 6e7f2d191..2a4de0d8b 100644 --- a/nemo_automodel/components/models/hy_v3/state_dict_adapter.py +++ b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py @@ -12,27 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""State dict conversion between HuggingFace HYV3 format and Automodel native format. - -Key differences between HF and native formats: - -HF format (tencent/Hy3-preview): - model.layers.{L}.mlp.experts.gate_up_proj # [n_experts, 2*moe_inter, hidden] - model.layers.{L}.mlp.experts.down_proj # [n_experts, hidden, moe_inter] - model.layers.{L}.mlp.e_score_correction_bias # [n_experts] (on MoE module) - model.layers.{L}.mlp.gate.weight # [n_experts, hidden] - model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight # separate MLPs - -Native format (Automodel GroupedExperts): - model.layers.{L}.mlp.experts.gate_and_up_projs # [n_local, hidden, 2*moe_inter] (transposed) - model.layers.{L}.mlp.experts.down_projs # [n_local, moe_inter, hidden] (transposed) - model.layers.{L}.mlp.gate.e_score_correction_bias # [n_local] (moved to Gate module) - model.layers.{L}.mlp.gate.weight # [n_experts, hidden] (unchanged) - model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight # unchanged - -All other keys (attention, norms, embeddings, lm_head) are identical between formats. -Expert parallelism: in from_hf each rank loads only its slice [start:end] of the -first (expert) dimension of gate_up_proj / down_proj. +"""State dict conversion between the on-disk tencent/Hy3-preview HF checkpoint +and Automodel's native (grouped-experts) format. + +On-disk HF format (what tencent/Hy3-preview safetensors actually contain): + model.layers.{L}.mlp.expert_bias # [n_experts] + model.layers.{L}.mlp.router.gate.weight # [n_experts, hidden] + model.layers.{L}.mlp.experts.{E}.gate_proj.weight # [moe_inter, hidden] + model.layers.{L}.mlp.experts.{E}.up_proj.weight # [moe_inter, hidden] + model.layers.{L}.mlp.experts.{E}.down_proj.weight # [hidden, moe_inter] + model.layers.{L}.mlp.shared_mlp.{gate,up,down}_proj.weight # [moe_inter, hidden] / [hidden, moe_inter] + +Automodel native format (matches the rest of the MoE stack): + model.layers.{L}.mlp.gate.e_score_correction_bias # [n_local] (on Gate, not MoE) + model.layers.{L}.mlp.gate.weight # [n_experts, hidden] + model.layers.{L}.mlp.experts.gate_and_up_projs # [n_local, hidden, 2*moe_inter] + model.layers.{L}.mlp.experts.down_projs # [n_local, moe_inter, hidden] + model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight # unchanged shapes + +Differences (vs. every other Automodel MoE adapter): + 1. Per-expert split tensors -> grouped (handled by MoESplitExpertsStateDictMixin). + 2. Three HYV3-specific name renames: expert_bias <-> gate.e_score_correction_bias, + router.gate.weight <-> gate.weight, shared_mlp.* <-> shared_experts.*. + 3. MTP layers (indices >= num_hidden_layers) on disk must be filtered out on load. + +Why the renames live in the adapter rather than in the storage reader's key_mapping: +nemo_automodel/components/checkpoint/checkpointing.py:507 deliberately passes +``reader_key_mapping=None`` when a model has a state_dict_adapter (to avoid +double-translation). So the adapter's ``to_hf`` / ``from_hf`` must produce keys +that match the actual on-disk strings. """ import logging @@ -45,27 +53,29 @@ from nemo_automodel.components.checkpoint.state_dict_adapter import StateDictAdapter from nemo_automodel.components.models.common import BackendConfig from nemo_automodel.components.moe.config import MoEConfig -from nemo_automodel.components.moe.state_dict_utils import get_expert_range_for_rank_from_mesh +from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin logger = logging.getLogger(__name__) -# Regex patterns used for key detection -_EXPERTS_GATE_UP = re.compile(r"(.*\.mlp)\.experts\.gate_up_proj$") -_EXPERTS_DOWN = re.compile(r"(.*\.mlp)\.experts\.down_proj$") -_E_SCORE_BIAS_HF = re.compile(r"(.*\.mlp)\.e_score_correction_bias$") -_EXPERTS_GATE_AND_UP = re.compile(r"(.*\.mlp)\.experts\.gate_and_up_projs$") -_EXPERTS_DOWN_NATIVE = re.compile(r"(.*\.mlp)\.experts\.down_projs$") -_E_SCORE_BIAS_NATIVE = re.compile(r"(.*\.mlp)\.gate\.e_score_correction_bias$") +# Pre-compiled HYV3-specific name renames (anchored to end-of-key for safety). +_NATIVE_TO_HF_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"\.mlp\.gate\.e_score_correction_bias$"), ".mlp.expert_bias"), + (re.compile(r"\.mlp\.gate\.weight$"), ".mlp.router.gate.weight"), + (re.compile(r"\.mlp\.shared_experts\."), ".mlp.shared_mlp."), +) +_HF_TO_NATIVE_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"\.mlp\.expert_bias$"), ".mlp.gate.e_score_correction_bias"), + (re.compile(r"\.mlp\.router\.gate\.weight$"), ".mlp.gate.weight"), + (re.compile(r"\.mlp\.shared_mlp\."), ".mlp.shared_experts."), +) -class HYV3StateDictAdapter(StateDictAdapter): - """Converts between HF HYV3 checkpoints and Automodel native format. +class HYV3StateDictAdapter(MoESplitExpertsStateDictMixin, StateDictAdapter): + """Bridges Automodel native (grouped experts) and tencent/Hy3-preview on-disk HF. - The HF format uses pre-grouped (batched) expert tensors but with the last - two dimensions transposed relative to Automodel's GroupedExperts convention. - The e_score_correction_bias lives on the MoE module in HF but on the Gate - sub-module in native format. + Inherits the per-expert split/merge logic from ``MoESplitExpertsStateDictMixin``; + only the three HYV3-specific name renames + MTP-layer filtering live here. """ def __init__( @@ -82,124 +92,144 @@ def __init__( self._uses_model_prefix = True # ------------------------------------------------------------------ - # HF → native + # Native -> on-disk HF # ------------------------------------------------------------------ - def from_hf( + def to_hf( self, - hf_state_dict: dict[str, Any], - device_mesh: Optional[DeviceMesh] = None, + state_dict: dict[str, Any], + exclude_key_regex: Optional[str] = None, **kwargs, ) -> dict[str, Any]: - """Convert HF checkpoint to native Automodel format. + """Convert native state dict back to the on-disk Tencent format. - For expert parallelism, only the slice owned by the current rank is - retained from the batched expert tensors. + Steps: + 1. Split grouped expert tensors into per-expert HF keys (mixin). + 2. Apply HYV3 name renames (gate.e_score_correction_bias -> expert_bias, + gate.weight -> router.gate.weight, shared_experts. -> shared_mlp.). """ - n_experts = self.moe_config.n_routed_experts - - if device_mesh is not None: - start_expert, end_expert = get_expert_range_for_rank_from_mesh(device_mesh, n_experts) - else: - start_expert, end_expert = 0, n_experts - - native: dict[str, Any] = {} - - for key, value in hf_state_dict.items(): - # Skip MTP (multi-token prediction) layers — not used during SFT. - # These appear as layers beyond num_hidden_layers in the checkpoint. - if self._is_mtp_key(key): + # Step 1: per-expert split via the mixin. Pass-through for non-expert keys. + hf_split: dict[str, Any] = self._to_hf_w_split_experts(state_dict) + + # Step 2: rename native -> on-disk Tencent. + out: dict[str, Any] = {} + for k, v in hf_split.items(): + new_k = k + for pat, repl in _NATIVE_TO_HF_RENAMES: + new_k, n = pat.subn(repl, new_k) + if n: + break + if exclude_key_regex and re.match(exclude_key_regex, new_k): continue + out[new_k] = v + return out - m = _EXPERTS_GATE_UP.match(key) - if m: - # HF: [n_experts, 2*moe_inter, hidden] → native: [n_local, hidden, 2*moe_inter] - sliced = value[start_expert:end_expert] - native_key = m.group(1) + ".experts.gate_and_up_projs" - native[native_key] = sliced.transpose(1, 2).contiguous() - continue - - m = _EXPERTS_DOWN.match(key) - if m: - # HF: [n_experts, hidden, moe_inter] → native: [n_local, moe_inter, hidden] - sliced = value[start_expert:end_expert] - native_key = m.group(1) + ".experts.down_projs" - native[native_key] = sliced.transpose(1, 2).contiguous() - continue + # ------------------------------------------------------------------ + # On-disk HF -> native + # ------------------------------------------------------------------ - m = _E_SCORE_BIAS_HF.match(key) - if m: - # Move from MoE module level to Gate sub-module. - native_key = m.group(1) + ".gate.e_score_correction_bias" - native[native_key] = value + def from_hf( + self, + hf_state_dict: dict[str, Any], + device_mesh: Optional[DeviceMesh] = None, + **kwargs, + ) -> dict[str, Any]: + """Convert the on-disk Tencent state dict to native format. + + Steps: + 1. Drop MTP (multi-token prediction) layer keys. + 2. Apply HYV3 name renames (on-disk -> native HF naming). + 3. Recover the EP device_mesh if the caller passed None: the standard + DCP path (checkpointing.py) calls us with ``moe_mesh=None`` on the + assumption that the state dict is already EP-local, but the mixin's + validator/merger needs the mesh to know which expert subset to + expect. Any per-expert tensor that survived ``to_hf`` is a DTensor + we can introspect for the mesh. + 4. Merge per-expert split tensors into grouped form via the mixin. + """ + # Step 1 + 2: filter MTP, then rename in a single pass to avoid copying twice. + renamed: dict[str, Any] = {} + for k, v in hf_state_dict.items(): + if self._is_mtp_key(k): continue + new_k = k + for pat, repl in _HF_TO_NATIVE_RENAMES: + new_k, n = pat.subn(repl, new_k) + if n: + break + renamed[new_k] = v - # All other keys (attn, norms, dense mlp, shared experts, lm_head) pass through. - native[key] = value + # Step 3: recover device_mesh from any expert-shard DTensor in the dict. + if device_mesh is None: + device_mesh = self._infer_ep_mesh(renamed) - return native + # Step 4: per-expert merge + EP slicing via the mixin. + return self._from_hf_w_merged_experts(renamed, device_mesh) # ------------------------------------------------------------------ - # native → HF + # Single-tensor variant required by the abstract base class. # ------------------------------------------------------------------ - def to_hf( + def convert_single_tensor_to_hf( self, - state_dict: dict[str, Any], - exclude_key_regex: Optional[str] = None, + fqn: str, + tensor: Any, **kwargs, - ) -> dict[str, Any]: - """Convert native Automodel state dict back to HF format.""" - hf: dict[str, Any] = {} - - for key, value in state_dict.items(): - converted_pairs = self.convert_single_tensor_to_hf(key, value, exclude_key_regex=exclude_key_regex) - for k, v in converted_pairs: - hf[k] = v + ) -> list[tuple[str, Any]]: + """Per-tensor variant of ``to_hf`` (used by save paths that stream tensors). - return hf - - def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]: - """Convert a single native tensor to its HF equivalent.""" + Mirrors ``to_hf`` but operating on one (fqn, tensor) at a time: + 1. Try the mixin's per-expert split. Returns multiple (key, tensor) pairs + when *fqn* names a grouped expert tensor; otherwise returns ``None``. + 2. Apply HYV3 name renames to whichever key set we end up with. + """ exclude_key_regex = kwargs.get("exclude_key_regex", None) - m = _EXPERTS_GATE_AND_UP.match(fqn) - if m: - # native: [n_local, hidden, 2*moe_inter] → HF: [n_local, 2*moe_inter, hidden] - hf_key = m.group(1) + ".experts.gate_up_proj" - result = [(hf_key, tensor.transpose(1, 2).contiguous())] + expert_split = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs) + if expert_split is not None: + pairs = expert_split else: - m = _EXPERTS_DOWN_NATIVE.match(fqn) - if m: - # native: [n_local, moe_inter, hidden] → HF: [n_local, hidden, moe_inter] - hf_key = m.group(1) + ".experts.down_proj" - result = [(hf_key, tensor.transpose(1, 2).contiguous())] - else: - m = _E_SCORE_BIAS_NATIVE.match(fqn) - if m: - hf_key = m.group(1) + ".e_score_correction_bias" - result = [(hf_key, tensor)] - else: - result = [(fqn, tensor)] - - if exclude_key_regex: - result = [(k, v) for k, v in result if not re.match(exclude_key_regex, k)] - - return result + pairs = [(fqn, tensor)] + + out: list[tuple[str, Any]] = [] + for k, v in pairs: + new_k = k + for pat, repl in _NATIVE_TO_HF_RENAMES: + new_k, n = pat.subn(repl, new_k) + if n: + break + if exclude_key_regex and re.match(exclude_key_regex, new_k): + continue + out.append((new_k, v)) + return out # ------------------------------------------------------------------ # Helpers # ------------------------------------------------------------------ def _is_mtp_key(self, key: str) -> bool: - """Return True if key belongs to a multi-token prediction (MTP) layer. - - HYV3 checkpoints include num_nextn_predict_layers extra layers beyond - the main num_hidden_layers. These are stored at layer indices >= num_hidden_layers - and are not used during standard SFT. - """ + """Return True if *key* belongs to an MTP layer (index >= num_hidden_layers).""" num_hidden = getattr(self.config, "num_hidden_layers", 80) m = re.match(r"(?:model\.)?layers\.(\d+)\.", key) - if m and int(m.group(1)) >= num_hidden: - return True - return False + return bool(m and int(m.group(1)) >= num_hidden) + + def _infer_ep_mesh(self, state_dict: dict[str, Any]) -> Optional[DeviceMesh]: + """Return the EP device mesh inferred from any per-expert DTensor in *state_dict*. + + The standard DCP load path produces per-expert split DTensors (via + ``to_hf``) whose ``device_mesh`` carries the ``ep`` dimension we need + to drive ``_from_hf_w_merged_experts``. We probe the dict for any + DTensor expert tensor and reuse its mesh. + """ + from torch.distributed.tensor import DTensor + + expert_segment = self._expert_path_segment + for k, v in state_dict.items(): + if not isinstance(v, DTensor): + continue + if f".{expert_segment}." not in k: + continue + mesh = v.device_mesh + if mesh is not None and "ep" in (mesh.mesh_dim_names or ()): + return mesh + return None From 91d268c86a74e7ef30ec1d3bb0cbbd4d48208b1e Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 13:30:07 -0700 Subject: [PATCH 17/27] chore(llm): env-gated parity dumps in train_ft.py setup() Add two debug-only hooks behind environment variables, both no-ops by default: - AUTOMODEL_PARITY_DUMP=: after build_model() returns, write each rank's post-load state_dict to /rank{R}_state_dict.pt. Used to verify HF -> Automodel weight loading matches a reference state dict from the on-disk safetensors. - AUTOMODEL_PARITY_LOGITS=: at the end of setup(), register forward hooks on embed_tokens, every decoder layer, the final norm, and lm_head; run one deterministic eval-mode forward through the PP schedule on a fixed input (torch.randint with seed 0, seqlen 8); each rank dumps its captured tensors to /rank{R}_outputs.pt. Stage 0 ranks capture hidden_0..hidden_{first_stage_layers}; stage 1 ranks capture the remaining hidden states + hidden_norm + logits. Used together with an HF transformers reference forward (same input) to validate per-layer parity within bf16 noise. No effect on production runs that don't set either env var. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- nemo_automodel/recipes/llm/train_ft.py | 126 +++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/nemo_automodel/recipes/llm/train_ft.py b/nemo_automodel/recipes/llm/train_ft.py index c8152d0d6..1ba82b76c 100644 --- a/nemo_automodel/recipes/llm/train_ft.py +++ b/nemo_automodel/recipes/llm/train_ft.py @@ -29,6 +29,7 @@ import logging import pathlib import time +import os from contextlib import nullcontext from typing import TYPE_CHECKING, Any, Dict, Optional @@ -1049,6 +1050,32 @@ def setup(self): logger.warning("logits_to_keep not found in model.forward. Using MaskedCrossEntropy instead.") self.loss_fn = MaskedCrossEntropy() + # DEBUG: dump post-load state_dict so we can compare against the raw HF + # reference (parity check). Trigger via env var so this is a no-op for + # production runs. AUTOMODEL_PARITY_DUMP= writes one .pt per rank. + _parity_dir = os.environ.get("AUTOMODEL_PARITY_DUMP") + if _parity_dir: + import torch.distributed as _td + + _rank = _td.get_rank() if _td.is_initialized() else 0 + os.makedirs(_parity_dir, exist_ok=True) + _model_for_dump = ( + model.parts if hasattr(model, "parts") and not isinstance(model, torch.nn.Module) else [model] + ) + if isinstance(model, AutoPipeline): + _model_for_dump = model.parts + else: + _model_for_dump = [model] + _flat: dict[str, "torch.Tensor"] = {} + for _idx, _part in enumerate(_model_for_dump): + for _k, _v in _part.state_dict().items(): + _t = _v.to_local() if hasattr(_v, "to_local") else _v + _flat[f"part{_idx}.{_k}"] = _t.detach().cpu() + _path = os.path.join(_parity_dir, f"rank{_rank}_state_dict.pt") + torch.save(_flat, _path) + _shapes = {k: tuple(v.shape) for k, v in list(_flat.items())[:5]} + print(f"[parity-dump] rank {_rank}: wrote {_path} ({len(_flat)} tensors). first 5 shapes={_shapes}", flush=True) + if isinstance(model, AutoPipeline): self.model_parts = model.parts self.pp = model @@ -1168,6 +1195,105 @@ def setup(self): # Log step scheduler details self._log_step_scheduler_details(self.step_scheduler) + # DEBUG: per-layer logits parity dump. AUTOMODEL_PARITY_LOGITS= + # runs a single deterministic forward right after setup, with hooks on + # embed_tokens, every decoder layer, the final norm, and lm_head; each + # rank writes its captured tensors. Compare against an HF transformers + # forward of the same input ids (seed 0, seqlen 8) for parity. + _parity_logits_dir = os.environ.get("AUTOMODEL_PARITY_LOGITS") + if _parity_logits_dir: + self._dump_parity_logits(_parity_logits_dir) + + def _dump_parity_logits(self, dump_dir: str) -> None: + """Run one deterministic forward with hooks and dump per-layer activations. + + Inputs are torch.randint(0, vocab_size, (1, 8)) under generator(seed=0) + on CPU, then moved to GPU — matches the HF reference dump. + """ + import torch.distributed as _td + + _rank = _td.get_rank() if _td.is_initialized() else 0 + os.makedirs(dump_dir, exist_ok=True) + + captured: dict[str, torch.Tensor] = {} + hooks: list = [] + + def _hook(name: str): + def fn(_m, _a, output): + t = output[0] if isinstance(output, (tuple, list)) else output + if t is None: + return + if hasattr(t, "to_local"): + t = t.to_local() + captured[name] = t.detach().float().cpu() + + return fn + + for part in self.model_parts: + inner = getattr(part, "model", None) + if inner is None: + continue + if getattr(inner, "embed_tokens", None) is not None: + hooks.append(inner.embed_tokens.register_forward_hook(_hook("hidden_0"))) + layers = getattr(inner, "layers", None) + if layers is not None: + for key in list(layers): + try: + idx = int(key) + except (TypeError, ValueError): + continue + layer = layers[key] + if layer is None: + continue + hooks.append(layer.register_forward_hook(_hook(f"hidden_{idx + 1}"))) + if getattr(inner, "norm", None) is not None: + hooks.append(inner.norm.register_forward_hook(_hook("hidden_norm"))) + if getattr(part, "lm_head", None) is not None: + hooks.append(part.lm_head.register_forward_hook(_hook("logits"))) + + # Build deterministic fixed inputs. Pipeline schedule expects + # local_batch_size rows so the chunker can produce the configured + # number of microbatches; we slice down to the first row when comparing. + cfg_model = self.model_parts[0] + vocab_size = getattr(getattr(cfg_model, "config", None), "vocab_size", 120832) + local_bs = self.cfg.get("step_scheduler.local_batch_size", 1) + gen = torch.Generator(device="cpu").manual_seed(0) + # Reuse the same first-row sequence as the HF reference run. + first_row = torch.randint(0, vocab_size, (1, 8), generator=gen, dtype=torch.long) + input_ids = first_row.expand(local_bs, -1).contiguous().to(self.dist_env.device) + + for part in self.model_parts: + part.eval() + + try: + with torch.no_grad(): + if self.pp is not None: + # Pipeline path: drive both stages via the schedule. + # Last stage needs a target+losses sink; we don't care about + # the loss number itself, just that forward completes. + self.pp.update_seq_len(input_ids.shape[1]) + has_last = self.pp.info.has_last_stage + has_first = self.pp.info.has_first_stage + losses_sink = [] if has_last else None + targets = ( + torch.zeros_like(input_ids) if has_last else None + ) + if has_first: + self.pp.info.schedule.eval(input_ids, target=targets, losses=losses_sink) + else: + self.pp.info.schedule.eval(target=targets, losses=losses_sink) + else: + self.model_parts[0](input_ids=input_ids) + finally: + for h in hooks: + h.remove() + + captured["inputs"] = input_ids.detach().cpu() + path = os.path.join(dump_dir, f"rank{_rank}_outputs.pt") + torch.save(captured, path) + keys = list(captured.keys()) + print(f"[parity-logits] rank {_rank}: wrote {path} (keys={keys})", flush=True) + def _collect_moe_load_balance(self): """Collect MoE load balance metrics with DP all-reduce. From a02350023dbbc28ce8aa8923857faa057a0e24c6 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 14:40:19 -0700 Subject: [PATCH 18/27] Revert "chore(llm): env-gated parity dumps in train_ft.py setup()" This reverts commit c23bc048ce39014a9f939746ce46739cceb5e276. Signed-off-by: HuiyingLi --- nemo_automodel/recipes/llm/train_ft.py | 126 ------------------------- 1 file changed, 126 deletions(-) diff --git a/nemo_automodel/recipes/llm/train_ft.py b/nemo_automodel/recipes/llm/train_ft.py index 1ba82b76c..c8152d0d6 100644 --- a/nemo_automodel/recipes/llm/train_ft.py +++ b/nemo_automodel/recipes/llm/train_ft.py @@ -29,7 +29,6 @@ import logging import pathlib import time -import os from contextlib import nullcontext from typing import TYPE_CHECKING, Any, Dict, Optional @@ -1050,32 +1049,6 @@ def setup(self): logger.warning("logits_to_keep not found in model.forward. Using MaskedCrossEntropy instead.") self.loss_fn = MaskedCrossEntropy() - # DEBUG: dump post-load state_dict so we can compare against the raw HF - # reference (parity check). Trigger via env var so this is a no-op for - # production runs. AUTOMODEL_PARITY_DUMP= writes one .pt per rank. - _parity_dir = os.environ.get("AUTOMODEL_PARITY_DUMP") - if _parity_dir: - import torch.distributed as _td - - _rank = _td.get_rank() if _td.is_initialized() else 0 - os.makedirs(_parity_dir, exist_ok=True) - _model_for_dump = ( - model.parts if hasattr(model, "parts") and not isinstance(model, torch.nn.Module) else [model] - ) - if isinstance(model, AutoPipeline): - _model_for_dump = model.parts - else: - _model_for_dump = [model] - _flat: dict[str, "torch.Tensor"] = {} - for _idx, _part in enumerate(_model_for_dump): - for _k, _v in _part.state_dict().items(): - _t = _v.to_local() if hasattr(_v, "to_local") else _v - _flat[f"part{_idx}.{_k}"] = _t.detach().cpu() - _path = os.path.join(_parity_dir, f"rank{_rank}_state_dict.pt") - torch.save(_flat, _path) - _shapes = {k: tuple(v.shape) for k, v in list(_flat.items())[:5]} - print(f"[parity-dump] rank {_rank}: wrote {_path} ({len(_flat)} tensors). first 5 shapes={_shapes}", flush=True) - if isinstance(model, AutoPipeline): self.model_parts = model.parts self.pp = model @@ -1195,105 +1168,6 @@ def setup(self): # Log step scheduler details self._log_step_scheduler_details(self.step_scheduler) - # DEBUG: per-layer logits parity dump. AUTOMODEL_PARITY_LOGITS= - # runs a single deterministic forward right after setup, with hooks on - # embed_tokens, every decoder layer, the final norm, and lm_head; each - # rank writes its captured tensors. Compare against an HF transformers - # forward of the same input ids (seed 0, seqlen 8) for parity. - _parity_logits_dir = os.environ.get("AUTOMODEL_PARITY_LOGITS") - if _parity_logits_dir: - self._dump_parity_logits(_parity_logits_dir) - - def _dump_parity_logits(self, dump_dir: str) -> None: - """Run one deterministic forward with hooks and dump per-layer activations. - - Inputs are torch.randint(0, vocab_size, (1, 8)) under generator(seed=0) - on CPU, then moved to GPU — matches the HF reference dump. - """ - import torch.distributed as _td - - _rank = _td.get_rank() if _td.is_initialized() else 0 - os.makedirs(dump_dir, exist_ok=True) - - captured: dict[str, torch.Tensor] = {} - hooks: list = [] - - def _hook(name: str): - def fn(_m, _a, output): - t = output[0] if isinstance(output, (tuple, list)) else output - if t is None: - return - if hasattr(t, "to_local"): - t = t.to_local() - captured[name] = t.detach().float().cpu() - - return fn - - for part in self.model_parts: - inner = getattr(part, "model", None) - if inner is None: - continue - if getattr(inner, "embed_tokens", None) is not None: - hooks.append(inner.embed_tokens.register_forward_hook(_hook("hidden_0"))) - layers = getattr(inner, "layers", None) - if layers is not None: - for key in list(layers): - try: - idx = int(key) - except (TypeError, ValueError): - continue - layer = layers[key] - if layer is None: - continue - hooks.append(layer.register_forward_hook(_hook(f"hidden_{idx + 1}"))) - if getattr(inner, "norm", None) is not None: - hooks.append(inner.norm.register_forward_hook(_hook("hidden_norm"))) - if getattr(part, "lm_head", None) is not None: - hooks.append(part.lm_head.register_forward_hook(_hook("logits"))) - - # Build deterministic fixed inputs. Pipeline schedule expects - # local_batch_size rows so the chunker can produce the configured - # number of microbatches; we slice down to the first row when comparing. - cfg_model = self.model_parts[0] - vocab_size = getattr(getattr(cfg_model, "config", None), "vocab_size", 120832) - local_bs = self.cfg.get("step_scheduler.local_batch_size", 1) - gen = torch.Generator(device="cpu").manual_seed(0) - # Reuse the same first-row sequence as the HF reference run. - first_row = torch.randint(0, vocab_size, (1, 8), generator=gen, dtype=torch.long) - input_ids = first_row.expand(local_bs, -1).contiguous().to(self.dist_env.device) - - for part in self.model_parts: - part.eval() - - try: - with torch.no_grad(): - if self.pp is not None: - # Pipeline path: drive both stages via the schedule. - # Last stage needs a target+losses sink; we don't care about - # the loss number itself, just that forward completes. - self.pp.update_seq_len(input_ids.shape[1]) - has_last = self.pp.info.has_last_stage - has_first = self.pp.info.has_first_stage - losses_sink = [] if has_last else None - targets = ( - torch.zeros_like(input_ids) if has_last else None - ) - if has_first: - self.pp.info.schedule.eval(input_ids, target=targets, losses=losses_sink) - else: - self.pp.info.schedule.eval(target=targets, losses=losses_sink) - else: - self.model_parts[0](input_ids=input_ids) - finally: - for h in hooks: - h.remove() - - captured["inputs"] = input_ids.detach().cpu() - path = os.path.join(dump_dir, f"rank{_rank}_outputs.pt") - torch.save(captured, path) - keys = list(captured.keys()) - print(f"[parity-logits] rank {_rank}: wrote {path} (keys={keys})", flush=True) - def _collect_moe_load_balance(self): """Collect MoE load balance metrics with DP all-reduce. From 0265f01abca2abc05ff9b472c63a375c50f51bc3 Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 28 Apr 2026 14:44:24 -0700 Subject: [PATCH 19/27] Apply suggestion from @jgerh Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: HuiyingLi --- docs/model-coverage/llm/tencent/hy3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/tencent/hy3.md b/docs/model-coverage/llm/tencent/hy3.md index 22b66d145..5bb0be790 100644 --- a/docs/model-coverage/llm/tencent/hy3.md +++ b/docs/model-coverage/llm/tencent/hy3.md @@ -35,7 +35,7 @@ ## Try with NeMo AutoModel -**1. Install** ([full instructions](../../../guides/installation.md)): +**1. Install** ([NeMo AutoModel](../../../guides/installation.md)): ```bash pip install nemo-automodel From f60a2da929f5eb7998106c118aae5445ed360c2f Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 28 Apr 2026 14:44:32 -0700 Subject: [PATCH 20/27] Apply suggestion from @jgerh Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: HuiyingLi --- docs/model-coverage/llm/tencent/hy3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/tencent/hy3.md b/docs/model-coverage/llm/tencent/hy3.md index 5bb0be790..87b4aef62 100644 --- a/docs/model-coverage/llm/tencent/hy3.md +++ b/docs/model-coverage/llm/tencent/hy3.md @@ -54,7 +54,7 @@ cd Automodel automodel --nproc-per-node=8 examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml ``` -See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). +See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). ## Fine-Tuning From b99084c666d36b8f9b388a177d6a08d6db4c200d Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 28 Apr 2026 14:44:41 -0700 Subject: [PATCH 21/27] Apply suggestion from @jgerh Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: HuiyingLi --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 2012a5913..497fc3337 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -11,7 +11,7 @@ To run LLMs with NeMo AutoModel, make sure you're using NeMo container version [ pip3 install --upgrade git+git@github.com:NVIDIA-NeMo/AutoModel.git ``` -For other installation options (e.g., uv), please see our [Installation Guide](../../guides/installation.md). +For other installation options (e.g., uv), see the [NeMo AutoModel Installation Guide](../../guides/installation.md). ## Supported Models From 3a1b2a4f3b271ffaa083d3f662eb073263de20b6 Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 28 Apr 2026 14:44:49 -0700 Subject: [PATCH 22/27] Apply suggestion from @jgerh Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: HuiyingLi --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 497fc3337..cc75e2b91 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -80,7 +80,7 @@ The models listed above can be fine-tuned using NeMo AutoModel. We support two p 1. **Parameter-Efficient Fine-Tuning (PEFT)**: Updates only a small subset of parameters (typically <1%) using techniques like Low-Rank Adaptation (LoRA). 2. **Supervised Fine-Tuning (SFT)**: Updates all or most model parameters for deeper adaptation. -Please see our [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data. +See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data. :::{tip} In these guides, we use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. See [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md). From 2b24efdbd2bea0e1b333c82e0f577f364c659ba8 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 14:51:30 -0700 Subject: [PATCH 23/27] chore(llm): drop Hy3 4-layer smoke/ckpt example yamls The two truncated 4-layer recipes (hy3_4layer_p0_smoke.yaml, hy3_4layer_p1_ckpt.yaml) were used during initial state-dict and checkpoint validation; the only remaining production recipe for HYV3 is hy3_preview_deepep.yaml. Drop the smoke files and remove the now-stale download links from the model-coverage page. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- docs/model-coverage/llm/tencent/hy3.md | 2 - .../hy_v3/hy3_4layer_p0_smoke.yaml | 152 ------------------ .../hy_v3/hy3_4layer_p1_ckpt.yaml | 148 ----------------- 3 files changed, 302 deletions(-) delete mode 100644 examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml delete mode 100644 examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml diff --git a/docs/model-coverage/llm/tencent/hy3.md b/docs/model-coverage/llm/tencent/hy3.md index 87b4aef62..ea2d8be1e 100644 --- a/docs/model-coverage/llm/tencent/hy3.md +++ b/docs/model-coverage/llm/tencent/hy3.md @@ -29,8 +29,6 @@ | Recipe | Description | |---|---| -| {download}`hy3_4layer_p0_smoke.yaml <../../../../examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml>` | SFT — Hy3 4-layer smoke test (P0) | -| {download}`hy3_4layer_p1_ckpt.yaml <../../../../examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml>` | SFT — Hy3 4-layer checkpoint test (P1) | | {download}`hy3_preview_deepep.yaml <../../../../examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml>` | SFT — Hy3-preview with DeepEP | ## Try with NeMo AutoModel diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml deleted file mode 100644 index 5b37964b5..000000000 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# P0 — End-to-end forward/backward/PP/EP health check for HYV3ForCausalLM. -# -# Uses the real tencent/Hy3-preview config and weights, but only 4 layers. -# Layers 4-79 are never created; their checkpoint keys are ignored (strict=False). -# Tests real tensor shapes (hidden=4096, 192 experts) and real routing logic. -# -# Layer layout (4 layers, pp=2): -# Stage 0: embed + layer 0 (dense) + layer 1 (MoE) -# Stage 1: layer 2 (MoE) + layer 3 (MoE) + lm_head -# ep_size=4 → 192 / 4 = 48 experts per rank -# total GPUs = pp(2) × ep(4) = 8 -# -# Hardware: 8× GPU (A100 80GB or equivalent) -# automodel examples/llm_finetune/hy_v3/hy3_4layer_p0_smoke.yaml --nproc-per-node 8 -# -# Before running: set HY3_PATH to your local Hy3-preview directory, then -# replace the two pretrained_model_name_or_path fields below accordingly. -# -# Expected outcomes: -# - Training starts within 3 min (HF checkpoint loading for 4 layers) -# - Loss at step 100 < loss at step 1 -# - No NaN / Inf in loss or gradients -# - Memory stable at every step (no OOM growth) -# - Validation runs at step 2 without error - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 8 - local_batch_size: 2 - ckpt_every_steps: 999999 # disable checkpoint for P0 - val_every_steps: 2 - max_steps: 100 - -dist_env: - backend: nccl - timeout_minutes: 30 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1234 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config - config: - _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - num_hidden_layers: 4 # only build layers 0-3; keys for 4-79 ignored - load_base_model: true - trust_remote_code: false - backend: - _target_: nemo_automodel.components.models.common.BackendConfig - attn: sdpa - linear: torch - rms_norm: torch_fp32 - experts: torch_mm - dispatcher: torch # torch dispatcher for P0 - fake_balanced_gate: false - gate_precision: float32 - rope_fusion: false - enable_hf_state_dict_adapter: true - enable_fsdp_optimizations: true - -checkpoint: - enabled: false - checkpoint_dir: /tmp/checkpoints/hy3_p0/ - -distributed: - strategy: fsdp2 - tp_size: 1 - cp_size: 1 - pp_size: 2 - ep_size: 4 # 192 / 4 = 48 experts per rank - - sequence_parallel: false - activation_checkpointing: true - - pipeline: - pp_schedule: 1f1b - pp_microbatch_size: 1 - round_virtual_stages_to_pp_multiple: down - scale_grads_in_schedule: false - patch_inner_model: false - patch_causal_lm_model: false - - moe: - reshard_after_forward: false - wrap_outer_model: false - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: true - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: validation - num_samples_limit: 32 - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: false - drop_last: true - -optimizer: - _target_: torch.optim.AdamW - betas: [0.9, 0.95] - eps: 1e-8 - lr: 1e-5 - weight_decay: 0.0 - -ci: - recipe_owner: khazic - time: "00:30:00" diff --git a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml b/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml deleted file mode 100644 index e9235ef9c..000000000 --- a/examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# P1 — Checkpoint save / resume health check for HYV3ForCausalLM. -# -# Same 4-layer truncated setup as P0 (real weights, real dims) but with DCP -# checkpoint enabled. Tests that the full FSDP2+EP state — including the -# e_score_correction_bias Gate buffers — is saved and restored correctly. -# -# Procedure: -# 1. Run for 100 steps; a DCP checkpoint is saved at step 50. -# 2. Kill the job, then re-run with --resume pointing to the saved checkpoint. -# 3. Confirm loss at step 51 (post-resume) matches the original run ± 1e-4. -# -# Hardware: 8× GPU (A100 80GB or equivalent) -# automodel examples/llm_finetune/hy_v3/hy3_4layer_p1_ckpt.yaml --nproc-per-node 8 -# -# Expected outcomes: -# - epoch_0_step_50/ directory present after Run 1 -# - Loss at step 51 (resumed) ≡ loss at step 51 (original) ± 1e-4 -# - e_score_correction_bias norm identical at step 51 across both runs - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 8 - local_batch_size: 2 - ckpt_every_steps: 50 - val_every_steps: 10 - max_steps: 100 - -dist_env: - backend: nccl - timeout_minutes: 30 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1234 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config - config: - _target_: transformers.AutoConfig.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - num_hidden_layers: 4 - load_base_model: true - trust_remote_code: false - backend: - _target_: nemo_automodel.components.models.common.BackendConfig - attn: sdpa - linear: torch - rms_norm: torch_fp32 - experts: torch_mm - dispatcher: torch - fake_balanced_gate: false - gate_precision: float32 - rope_fusion: false - enable_hf_state_dict_adapter: true - enable_fsdp_optimizations: true - -checkpoint: - enabled: true - checkpoint_dir: /tmp/checkpoints/hy3_p1/ - model_save_format: safetensors - save_consolidated: false - -distributed: - strategy: fsdp2 - tp_size: 1 - cp_size: 1 - pp_size: 2 - ep_size: 4 - - sequence_parallel: false - activation_checkpointing: true - - pipeline: - pp_schedule: 1f1b - pp_microbatch_size: 1 - round_virtual_stages_to_pp_multiple: down - scale_grads_in_schedule: false - patch_inner_model: false - patch_causal_lm_model: false - - moe: - reshard_after_forward: false - wrap_outer_model: false - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: true - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: validation - num_samples_limit: 32 - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: tencent/Hy3-preview - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 64 - shuffle: false - drop_last: true - -optimizer: - _target_: torch.optim.AdamW - betas: [0.9, 0.95] - eps: 1e-8 - lr: 1e-5 - weight_decay: 0.0 - -ci: - recipe_owner: khazic - time: "00:30:00" From 47c25db7f6bdaa8c2b00d86f08f30773a1258e71 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 15:55:35 -0700 Subject: [PATCH 24/27] chore(checkpoint): drop redundant changes already on main The PR branch's modifications to nemo_automodel/components/checkpoint/checkpointing.py were tracking older state of main; the relevant moe_mesh wiring (_maybe_adapt_state_dict_from_hf(..., moe_mesh=self.moe_mesh)) has been on main since #1904 (Adil, 2025-10-16). Sync this file back to main. Also drop the DefaultLoadPlanner(allow_partial_load=True) shim that was introduced for the previous HYV3 adapter where the e_score_correction_bias buffer's HF key was not renamed correctly. The new mixin-based adapter (commit cb30a596) renames expert_bias <-> e_score_correction_bias during from_hf/to_hf, so DCP finds matching keys on disk and allow_partial_load is unnecessary. Verified end-to-end on hy3_4layer_p0_smoke (pp=2, ep=4, 8 GPUs): 3 training steps complete, no missing-key errors, MoE weights load correctly via the shared mixin path with moe_mesh from the call site. While here, drop the unused _infer_ep_mesh helper in HYV3StateDictAdapter -- the call site always supplies moe_mesh now. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- .../components/checkpoint/checkpointing.py | 13 +------ .../models/hy_v3/state_dict_adapter.py | 38 ++----------------- 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/nemo_automodel/components/checkpoint/checkpointing.py b/nemo_automodel/components/checkpoint/checkpointing.py index 7b9225d8e..5bc3cb8ad 100644 --- a/nemo_automodel/components/checkpoint/checkpointing.py +++ b/nemo_automodel/components/checkpoint/checkpointing.py @@ -553,11 +553,6 @@ def load_model( if compat_tied_lm_head_source_key is not None and isinstance(lm_head_param_name, str): state_dict[lm_head_param_name] = state_dict.pop(compat_tied_lm_head_source_key) - # Pass moe_mesh so adapter.from_hf can validate / merge against the - # rank-local subset of experts. DCP has already populated only the - # rank-local slice; the mixin's _from_hf_w_merged_experts iterates only - # what's in the dict (no re-slicing), but it needs the mesh to know - # which expert IDs are expected on this rank. state_dict = _maybe_adapt_state_dict_from_hf(model_state.model[0], state_dict, moe_mesh=self.moe_mesh) key_diff = _summarize_state_dict_key_diff(expected_keys, set(state_dict.keys())) if key_diff["missing_count"] or key_diff["unexpected_count"]: @@ -814,13 +809,7 @@ def _do_load( if self.config.is_peft and is_model and (not is_init_step): state_dict = load_file(os.path.join(path, "adapter_model.safetensors")) else: - if is_init_step and storage_reader is not None: - from torch.distributed.checkpoint import DefaultLoadPlanner - - planner = DefaultLoadPlanner(allow_partial_load=True) - else: - planner = None - dcp.load(state_dict, checkpoint_id=path, storage_reader=storage_reader, planner=planner) + dcp.load(state_dict, checkpoint_id=path, storage_reader=storage_reader) return state_dict def _do_save( diff --git a/nemo_automodel/components/models/hy_v3/state_dict_adapter.py b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py index 2a4de0d8b..a75d77a91 100644 --- a/nemo_automodel/components/models/hy_v3/state_dict_adapter.py +++ b/nemo_automodel/components/models/hy_v3/state_dict_adapter.py @@ -139,15 +139,10 @@ def from_hf( Steps: 1. Drop MTP (multi-token prediction) layer keys. 2. Apply HYV3 name renames (on-disk -> native HF naming). - 3. Recover the EP device_mesh if the caller passed None: the standard - DCP path (checkpointing.py) calls us with ``moe_mesh=None`` on the - assumption that the state dict is already EP-local, but the mixin's - validator/merger needs the mesh to know which expert subset to - expect. Any per-expert tensor that survived ``to_hf`` is a DTensor - we can introspect for the mesh. - 4. Merge per-expert split tensors into grouped form via the mixin. + 3. Merge per-expert split tensors into grouped form via the mixin + (validates expert availability against the rank's EP slice). """ - # Step 1 + 2: filter MTP, then rename in a single pass to avoid copying twice. + # Step 1 + 2: filter MTP, rename to native names, in a single pass. renamed: dict[str, Any] = {} for k, v in hf_state_dict.items(): if self._is_mtp_key(k): @@ -159,11 +154,7 @@ def from_hf( break renamed[new_k] = v - # Step 3: recover device_mesh from any expert-shard DTensor in the dict. - if device_mesh is None: - device_mesh = self._infer_ep_mesh(renamed) - - # Step 4: per-expert merge + EP slicing via the mixin. + # Step 3: per-expert merge + EP slicing via the mixin. return self._from_hf_w_merged_experts(renamed, device_mesh) # ------------------------------------------------------------------ @@ -212,24 +203,3 @@ def _is_mtp_key(self, key: str) -> bool: num_hidden = getattr(self.config, "num_hidden_layers", 80) m = re.match(r"(?:model\.)?layers\.(\d+)\.", key) return bool(m and int(m.group(1)) >= num_hidden) - - def _infer_ep_mesh(self, state_dict: dict[str, Any]) -> Optional[DeviceMesh]: - """Return the EP device mesh inferred from any per-expert DTensor in *state_dict*. - - The standard DCP load path produces per-expert split DTensors (via - ``to_hf``) whose ``device_mesh`` carries the ``ep`` dimension we need - to drive ``_from_hf_w_merged_experts``. We probe the dict for any - DTensor expert tensor and reuse its mesh. - """ - from torch.distributed.tensor import DTensor - - expert_segment = self._expert_path_segment - for k, v in state_dict.items(): - if not isinstance(v, DTensor): - continue - if f".{expert_segment}." not in k: - continue - mesh = v.device_mesh - if mesh is not None and "ep" in (mesh.mesh_dim_names or ()): - return mesh - return None From 66780c9e5a585bd80042f35d66963401456ae947 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 16:00:16 -0700 Subject: [PATCH 25/27] test(llm): unit tests for HYV3StateDictAdapter 40 tests covering all behavior introduced by the rewritten adapter: - Initialization: attribute wiring, default dtype, mixin inheritance. - Rename tables (_NATIVE_TO_HF_RENAMES / _HF_TO_NATIVE_RENAMES): parametrized round-trip for each rename pair, plus negative cases that must NOT be renamed (attention, layernorm, embed, dense MLP, lm_head, model.norm). - from_hf (on-disk -> native): * router.gate.weight -> gate.weight * expert_bias -> gate.e_score_correction_bias * shared_mlp.* -> shared_experts.* * per-expert split keys merged into experts.gate_and_up_projs and experts.down_projs with the right [E,H,2I]/[E,I,H] native shapes and value-level transposed/concatenated layout * MTP layer keys (index >= num_hidden_layers) dropped * unrelated keys pass through unchanged - to_hf (native -> on-disk): * reverse renames produce the on-disk Tencent names * grouped expert tensors split into per-expert keys with [moe_inter, hidden] / [hidden, moe_inter] disk shapes * exclude_key_regex honored - convert_single_tensor_to_hf: * non-expert keys renamed or pass through * expert tensors split + renamed (one input -> 2*E or E pairs) * exclude_key_regex applied after rename - Round-trip integrity: * native -> to_hf -> from_hf recovers every key value-for-value * disk -> from_hf -> to_hf recovers every non-MTP key value-for-value - _is_mtp_key: parametrized layer-index classification with/without the "model." prefix, plus a config-threshold variation test. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- .../hy_v3/test_hy_v3_state_dict_adapter.py | 536 ++++++++++++++++++ 1 file changed, 536 insertions(+) create mode 100644 tests/unit_tests/models/hy_v3/test_hy_v3_state_dict_adapter.py diff --git a/tests/unit_tests/models/hy_v3/test_hy_v3_state_dict_adapter.py b/tests/unit_tests/models/hy_v3/test_hy_v3_state_dict_adapter.py new file mode 100644 index 000000000..675c890b3 --- /dev/null +++ b/tests/unit_tests/models/hy_v3/test_hy_v3_state_dict_adapter.py @@ -0,0 +1,536 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``HYV3StateDictAdapter``. + +Covers the four behaviors that distinguish HYV3 from the shared +``MoESplitExpertsStateDictMixin``: + + 1. HYV3-specific name renames (router.gate, expert_bias, shared_mlp.). + 2. Per-expert split / merge inherited from the mixin. + 3. MTP-layer filtering (drops keys for layer index >= num_hidden_layers). + 4. Round-trip integrity: ``from_hf(to_hf(x))`` recovers ``x``. +""" + +from unittest.mock import Mock + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_v3.state_dict_adapter import ( + _HF_TO_NATIVE_RENAMES, + _NATIVE_TO_HF_RENAMES, + HYV3StateDictAdapter, +) +from nemo_automodel.components.moe.config import MoEConfig + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +N_EXPERTS = 4 +HIDDEN = 16 +MOE_INTER = 8 +NUM_LAYERS = 2 # layer 0 dense, layer 1 MoE +NUM_MTP = 1 + + +@pytest.fixture +def config(): + cfg = Mock() + cfg.num_hidden_layers = NUM_LAYERS + cfg.hidden_size = HIDDEN + cfg.intermediate_size = 32 + cfg.moe_intermediate_size = MOE_INTER + cfg.num_attention_heads = 4 + cfg.num_key_value_heads = 2 + cfg.num_experts = N_EXPERTS + cfg.num_experts_per_tok = 2 + cfg.num_shared_experts = 1 + cfg.first_k_dense_replace = 1 + cfg.num_nextn_predict_layers = NUM_MTP + return cfg + + +@pytest.fixture +def moe_config(): + return MoEConfig( + dim=HIDDEN, + inter_dim=32, + moe_inter_dim=MOE_INTER, + n_routed_experts=N_EXPERTS, + n_shared_experts=1, + n_activated_experts=2, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + gate_bias_update_factor=0.0, + score_func="sigmoid", + route_scale=1.0, + aux_loss_coeff=0.0, + norm_topk_prob=False, + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + force_e_score_correction_bias=True, + ) + + +@pytest.fixture +def backend_config(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", + experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + enable_hf_state_dict_adapter=False, + ) + + +@pytest.fixture +def adapter(config, moe_config, backend_config): + return HYV3StateDictAdapter( + config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float32 + ) + + +def _make_disk_state_dict(*, with_mtp: bool = True): + """Synthesize a state dict matching the on-disk Tencent Hy3-preview format. + + Layer 0: dense MLP. Layer 1: MoE with N_EXPERTS experts + shared MLP + + router gate + expert_bias. Optionally include one MTP layer at index + NUM_LAYERS that should be filtered out by ``from_hf``. + """ + sd: dict[str, torch.Tensor] = { + # Top-level (passes through unchanged on both directions). + "model.embed_tokens.weight": torch.randn(32, HIDDEN), + "model.norm.weight": torch.randn(HIDDEN), + "lm_head.weight": torch.randn(32, HIDDEN), + # Layer 0: dense MLP + attention. + "model.layers.0.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.mlp.gate_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.up_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.down_proj.weight": torch.randn(HIDDEN, 32), + # Layer 1: MoE -- on-disk format with Tencent-internal names. + "model.layers.1.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.1.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.1.mlp.router.gate.weight": torch.randn(N_EXPERTS, HIDDEN), + "model.layers.1.mlp.expert_bias": torch.randn(N_EXPERTS), + "model.layers.1.mlp.shared_mlp.gate_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_mlp.up_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_mlp.down_proj.weight": torch.randn(HIDDEN, MOE_INTER), + } + for e in range(N_EXPERTS): + sd[f"model.layers.1.mlp.experts.{e}.gate_proj.weight"] = torch.randn(MOE_INTER, HIDDEN) + sd[f"model.layers.1.mlp.experts.{e}.up_proj.weight"] = torch.randn(MOE_INTER, HIDDEN) + sd[f"model.layers.1.mlp.experts.{e}.down_proj.weight"] = torch.randn(HIDDEN, MOE_INTER) + if with_mtp: + # MTP layer: layer index NUM_LAYERS, must be dropped on from_hf. + sd[f"model.layers.{NUM_LAYERS}.input_layernorm.weight"] = torch.randn(HIDDEN) + sd[f"model.layers.{NUM_LAYERS}.mlp.expert_bias"] = torch.randn(N_EXPERTS) + return sd + + +def _make_native_state_dict(): + """Synthesize a state dict matching the Automodel native HYV3 format.""" + sd: dict[str, torch.Tensor] = { + "model.embed_tokens.weight": torch.randn(32, HIDDEN), + "model.norm.weight": torch.randn(HIDDEN), + "lm_head.weight": torch.randn(32, HIDDEN), + "model.layers.0.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.mlp.gate_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.up_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.down_proj.weight": torch.randn(HIDDEN, 32), + "model.layers.1.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.1.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + # Native MoE keys (post-rename + post-merge): + "model.layers.1.mlp.gate.weight": torch.randn(N_EXPERTS, HIDDEN), + "model.layers.1.mlp.gate.e_score_correction_bias": torch.randn(N_EXPERTS), + "model.layers.1.mlp.experts.gate_and_up_projs": torch.randn(N_EXPERTS, HIDDEN, 2 * MOE_INTER), + "model.layers.1.mlp.experts.down_projs": torch.randn(N_EXPERTS, MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_experts.gate_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_experts.up_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_experts.down_proj.weight": torch.randn(HIDDEN, MOE_INTER), + } + return sd + + +# --------------------------------------------------------------------------- +# Initialization +# --------------------------------------------------------------------------- + + +class TestInitialization: + def test_attributes_set(self, config, moe_config, backend_config): + a = HYV3StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float16) + assert a.config is config + assert a.moe_config is moe_config + assert a.backend is backend_config + assert a.dtype == torch.float16 + assert a._uses_model_prefix is True + + def test_default_dtype_is_bfloat16(self, config, moe_config, backend_config): + a = HYV3StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config) + assert a.dtype == torch.bfloat16 + + def test_inherits_mixin(self, adapter): + from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin + + assert isinstance(adapter, MoESplitExpertsStateDictMixin) + + +# --------------------------------------------------------------------------- +# Rename tables +# --------------------------------------------------------------------------- + + +class TestRenameTables: + """Sanity-check the static rename tables: each native pattern must round-trip.""" + + @pytest.mark.parametrize( + "native, hf", + [ + ("model.layers.5.mlp.gate.e_score_correction_bias", "model.layers.5.mlp.expert_bias"), + ("model.layers.5.mlp.gate.weight", "model.layers.5.mlp.router.gate.weight"), + ("model.layers.5.mlp.shared_experts.gate_proj.weight", "model.layers.5.mlp.shared_mlp.gate_proj.weight"), + ("model.layers.5.mlp.shared_experts.up_proj.weight", "model.layers.5.mlp.shared_mlp.up_proj.weight"), + ("model.layers.5.mlp.shared_experts.down_proj.weight", "model.layers.5.mlp.shared_mlp.down_proj.weight"), + ], + ) + def test_native_to_hf_round_trip(self, native, hf): + # Apply native->HF + nk = native + for pat, repl in _NATIVE_TO_HF_RENAMES: + nk, n = pat.subn(repl, nk) + if n: + break + assert nk == hf + + # Apply HF->native + hk = hf + for pat, repl in _HF_TO_NATIVE_RENAMES: + hk, n = pat.subn(repl, hk) + if n: + break + assert hk == native + + def test_unrelated_keys_pass_through(self): + """Renames must not touch attention, embed, lm_head, layernorm keys.""" + for k in ( + "model.embed_tokens.weight", + "lm_head.weight", + "model.layers.0.self_attn.q_proj.weight", + "model.layers.0.input_layernorm.weight", + "model.layers.0.mlp.gate_proj.weight", # dense MLP gate_proj must NOT match + "model.norm.weight", + ): + for tab in (_NATIVE_TO_HF_RENAMES, _HF_TO_NATIVE_RENAMES): + v = k + for pat, repl in tab: + v, n = pat.subn(repl, v) + if n: + break + assert v == k, f"{k} unexpectedly renamed to {v}" + + +# --------------------------------------------------------------------------- +# from_hf: on-disk -> native +# --------------------------------------------------------------------------- + + +class TestFromHF: + def test_renames_router_gate(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + assert "model.layers.1.mlp.gate.weight" in native + assert "model.layers.1.mlp.router.gate.weight" not in native + + def test_renames_expert_bias_to_gate_bias(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + assert "model.layers.1.mlp.gate.e_score_correction_bias" in native + assert "model.layers.1.mlp.expert_bias" not in native + + def test_renames_shared_mlp_to_shared_experts(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.shared_experts.{proj}.weight" in native + assert f"model.layers.1.mlp.shared_mlp.{proj}.weight" not in native + + def test_merges_experts_into_grouped_form(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + # Per-expert split keys must be gone. + for e in range(N_EXPERTS): + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" not in native + # Grouped tensors present. + assert "model.layers.1.mlp.experts.gate_and_up_projs" in native + assert "model.layers.1.mlp.experts.down_projs" in native + + def test_merged_shapes_are_native_layout(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + # Native gate_and_up_projs: [E, hidden, 2*moe_inter] + assert tuple(native["model.layers.1.mlp.experts.gate_and_up_projs"].shape) == ( + N_EXPERTS, + HIDDEN, + 2 * MOE_INTER, + ) + # Native down_projs: [E, moe_inter, hidden] + assert tuple(native["model.layers.1.mlp.experts.down_projs"].shape) == ( + N_EXPERTS, + MOE_INTER, + HIDDEN, + ) + + def test_merged_values_match_per_expert_inputs(self, adapter): + """The stacked native tensors must contain the per-expert tensors transposed + and concatenated in the well-defined gate-then-up order.""" + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + + gate_up = native["model.layers.1.mlp.experts.gate_and_up_projs"] + down = native["model.layers.1.mlp.experts.down_projs"] + for e in range(N_EXPERTS): + g_hf = hf[f"model.layers.1.mlp.experts.{e}.gate_proj.weight"] # [moe_inter, hidden] + u_hf = hf[f"model.layers.1.mlp.experts.{e}.up_proj.weight"] + d_hf = hf[f"model.layers.1.mlp.experts.{e}.down_proj.weight"] # [hidden, moe_inter] + + # gate half: [hidden, moe_inter] -> first MOE_INTER columns of gate_and_up_projs + assert torch.allclose(gate_up[e, :, :MOE_INTER], g_hf.transpose(0, 1).to(adapter.dtype)) + # up half: last MOE_INTER columns + assert torch.allclose(gate_up[e, :, MOE_INTER:], u_hf.transpose(0, 1).to(adapter.dtype)) + # down: [hidden, moe_inter] HF -> [moe_inter, hidden] native + assert torch.allclose(down[e], d_hf.transpose(0, 1).to(adapter.dtype)) + + def test_drops_mtp_layer_keys(self, adapter): + hf = _make_disk_state_dict(with_mtp=True) + # MTP keys are present in input. + assert any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in hf) + native = adapter.from_hf(hf, device_mesh=None) + # MTP keys must not survive. + assert not any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in native) + assert not any(k.startswith(f"layers.{NUM_LAYERS}.") for k in native) + + def test_passes_through_unrelated_keys(self, adapter): + hf = _make_disk_state_dict(with_mtp=False) + native = adapter.from_hf(hf, device_mesh=None) + for k in ( + "model.embed_tokens.weight", + "model.norm.weight", + "lm_head.weight", + "model.layers.0.self_attn.q_proj.weight", + "model.layers.0.input_layernorm.weight", + "model.layers.0.mlp.gate_proj.weight", + "model.layers.1.input_layernorm.weight", + "model.layers.1.self_attn.q_proj.weight", + ): + assert k in native + assert torch.equal(native[k], hf[k]) + + +# --------------------------------------------------------------------------- +# to_hf: native -> on-disk +# --------------------------------------------------------------------------- + + +class TestToHF: + def test_renames_native_back_to_on_disk(self, adapter): + native = _make_native_state_dict() + hf = adapter.to_hf(native) + # On-disk keys present. + assert "model.layers.1.mlp.router.gate.weight" in hf + assert "model.layers.1.mlp.expert_bias" in hf + assert "model.layers.1.mlp.shared_mlp.gate_proj.weight" in hf + # Native keys gone. + assert "model.layers.1.mlp.gate.weight" not in hf + assert "model.layers.1.mlp.gate.e_score_correction_bias" not in hf + assert "model.layers.1.mlp.shared_experts.gate_proj.weight" not in hf + + def test_splits_experts_into_per_expert_keys(self, adapter): + native = _make_native_state_dict() + hf = adapter.to_hf(native) + # Grouped keys gone. + assert "model.layers.1.mlp.experts.gate_and_up_projs" not in hf + assert "model.layers.1.mlp.experts.down_projs" not in hf + # Per-expert split keys present. + for e in range(N_EXPERTS): + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" in hf + + def test_per_expert_shapes_match_disk_layout(self, adapter): + native = _make_native_state_dict() + hf = adapter.to_hf(native) + for e in range(N_EXPERTS): + assert tuple(hf[f"model.layers.1.mlp.experts.{e}.gate_proj.weight"].shape) == (MOE_INTER, HIDDEN) + assert tuple(hf[f"model.layers.1.mlp.experts.{e}.up_proj.weight"].shape) == (MOE_INTER, HIDDEN) + assert tuple(hf[f"model.layers.1.mlp.experts.{e}.down_proj.weight"].shape) == (HIDDEN, MOE_INTER) + + def test_exclude_key_regex(self, adapter): + native = _make_native_state_dict() + native["custom.exclude.weight"] = torch.randn(2, 2) + hf = adapter.to_hf(native, exclude_key_regex=r"^custom\.exclude") + assert "custom.exclude.weight" not in hf + # Renames still applied. + assert "model.layers.1.mlp.router.gate.weight" in hf + + +# --------------------------------------------------------------------------- +# convert_single_tensor_to_hf +# --------------------------------------------------------------------------- + + +class TestConvertSingleTensorToHF: + def test_non_expert_key_renamed(self, adapter): + t = torch.randn(N_EXPERTS, HIDDEN) + out = adapter.convert_single_tensor_to_hf("model.layers.1.mlp.gate.weight", t) + assert len(out) == 1 + assert out[0][0] == "model.layers.1.mlp.router.gate.weight" + assert torch.equal(out[0][1], t) + + def test_non_expert_key_pass_through(self, adapter): + t = torch.randn(HIDDEN, HIDDEN) + out = adapter.convert_single_tensor_to_hf("model.layers.0.self_attn.q_proj.weight", t) + assert len(out) == 1 + assert out[0][0] == "model.layers.0.self_attn.q_proj.weight" + assert torch.equal(out[0][1], t) + + def test_expert_tensor_split_and_renamed(self, adapter): + # gate_and_up_projs -> per-expert gate_proj.weight + up_proj.weight (split + transposed) + t = torch.randn(N_EXPERTS, HIDDEN, 2 * MOE_INTER) + pairs = adapter.convert_single_tensor_to_hf("model.layers.1.mlp.experts.gate_and_up_projs", t) + keys = {k for k, _ in pairs} + assert len(pairs) == 2 * N_EXPERTS + for e in range(N_EXPERTS): + assert f"model.layers.1.mlp.experts.{e}.gate_proj.weight" in keys + assert f"model.layers.1.mlp.experts.{e}.up_proj.weight" in keys + + def test_expert_down_proj_split_and_transposed(self, adapter): + t = torch.randn(N_EXPERTS, MOE_INTER, HIDDEN) + pairs = adapter.convert_single_tensor_to_hf("model.layers.1.mlp.experts.down_projs", t) + keys = {k for k, _ in pairs} + assert len(pairs) == N_EXPERTS + for e in range(N_EXPERTS): + assert f"model.layers.1.mlp.experts.{e}.down_proj.weight" in keys + # Shape per expert: [hidden, moe_inter] + for k, v in pairs: + assert tuple(v.shape) == (HIDDEN, MOE_INTER) + + def test_exclude_regex_applied_after_rename(self, adapter): + t = torch.randn(N_EXPERTS) + out = adapter.convert_single_tensor_to_hf( + "model.layers.1.mlp.gate.e_score_correction_bias", + t, + exclude_key_regex=r".*\.expert_bias$", # matches the renamed-to name + ) + assert out == [] + + def test_exclude_regex_when_no_match(self, adapter): + t = torch.randn(2, 2) + out = adapter.convert_single_tensor_to_hf("custom.weight", t, exclude_key_regex=r"^never") + assert len(out) == 1 + assert out[0][0] == "custom.weight" + + +# --------------------------------------------------------------------------- +# Round-trip: from_hf(to_hf(native)) == native +# --------------------------------------------------------------------------- + + +class TestRoundTrip: + def test_native_to_hf_to_native(self, adapter): + native = _make_native_state_dict() + hf = adapter.to_hf(native) + recovered = adapter.from_hf(hf, device_mesh=None) + + assert set(recovered.keys()) == set(native.keys()) + for k in native: + a, b = native[k].float(), recovered[k].float() + assert a.shape == b.shape, f"{k}: shape {a.shape} != {b.shape}" + assert torch.allclose(a, b, atol=1e-5, rtol=1e-5), f"{k} differs after round-trip" + + def test_disk_to_native_to_disk(self, adapter): + """Loading from disk then re-saving must round-trip every per-expert key. + MTP keys (which from_hf drops) are excluded from the comparison.""" + hf = _make_disk_state_dict(with_mtp=True) + native = adapter.from_hf(hf, device_mesh=None) + re_hf = adapter.to_hf(native) + + expected_keys = {k for k in hf if not adapter._is_mtp_key(k)} + # MTP layer keys must not show up in re_hf either. + assert set(re_hf.keys()) == expected_keys + + for k in expected_keys: + a, b = hf[k].float(), re_hf[k].float() + assert a.shape == b.shape, f"{k}: shape mismatch" + assert torch.allclose(a, b, atol=1e-5, rtol=1e-5), f"{k} differs after round-trip" + + +# --------------------------------------------------------------------------- +# _is_mtp_key +# --------------------------------------------------------------------------- + + +class TestIsMTPKey: + @pytest.mark.parametrize( + "key, expected", + [ + ("model.layers.0.self_attn.q_proj.weight", False), + ("model.layers.1.mlp.expert_bias", False), + (f"model.layers.{NUM_LAYERS}.input_layernorm.weight", True), + (f"model.layers.{NUM_LAYERS}.mlp.expert_bias", True), + (f"model.layers.{NUM_LAYERS + 5}.self_attn.q_proj.weight", True), + ("model.embed_tokens.weight", False), + ("model.norm.weight", False), + ("lm_head.weight", False), + # Without the model. prefix + ("layers.0.foo", False), + (f"layers.{NUM_LAYERS}.foo", True), + ], + ) + def test_layer_index_classification(self, adapter, key, expected): + assert adapter._is_mtp_key(key) is expected + + def test_threshold_uses_config_num_hidden_layers(self, config, moe_config, backend_config): + config.num_hidden_layers = 80 + a = HYV3StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config) + assert a._is_mtp_key("model.layers.79.foo") is False + assert a._is_mtp_key("model.layers.80.foo") is True From 8f568532c4aca37bfe224225d79c70acdb7b62fc Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 16:10:15 -0700 Subject: [PATCH 26/27] test(llm): cover remaining HYV3 + recipe changes in this PR Adds tests for every code change in the PR that wasn't already covered: - tests/unit_tests/models/hy_v3/test_hy_v3_config.py (15 tests): HYV3Config defaults match the published 295B spec, override propagation for attention dims / MoE routing / layer truncation / first_k_dense_replace / router flags / RoPE / token IDs, to_dict round-trip, and class-level model_type stability. - tests/unit_tests/models/hy_v3/test_hy_v3_layers.py (10 tests): HYV3Attention initialization (projection shapes, per-head qk_norm, attention_bias on/off), forward output shapes through the sdpa backend, q/k/v/o projections all called, attention_mask propagated, init_weights resets norms + reseeds linears. - tests/unit_tests/models/hy_v3/test_hy_v3_model.py (24 tests): Block dense vs MoE switching at first_k_dense_replace, residual forward calls attn + mlp, attention_mask -> padding_mask conversion, init_weights propagates to sub-components; HYV3Model construction + dense+MoE structure + moe_config inference + moe_overrides + moe_config-vs-overrides conflict + forward + position_ids + init_weights; HYV3ForCausalLM construction, optional state_dict_adapter wiring, default backend, get/set in/out embeddings, forward logits shape, initialize_weights, the update_moe_gate_bias no-op-when-factor-zero contract (regression test for 564ff4f2), from_config/from_pretrained classmethods, ModelClass alias, module exports. - tests/unit_tests/_transformers/test_registry_hy_v3.py (6 tests): HYV3ForCausalLM is registered in MODEL_ARCH_MAPPING and resolves to the right (module, class). hy_v3 is registered in _CUSTOM_CONFIG_REGISTRATIONS and the resolved class has model_type == 'hy_v3'. Negative tests confirm the removed Ministral3 bidirectional retrieval keys are gone from SUPPORTED_BACKBONES. - tests/unit_tests/recipes/test_train_ft.py (3 tests): PEFT + torch_save raises ValueError (parity with the equivalent test added in test_finetune_vlm_helpers.py), PEFT + safetensors succeeds, non-PEFT + torch_save succeeds. All 107 tests pass (40 from the prior adapter test commit + 67 new here). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- .../_transformers/test_registry_hy_v3.py | 71 ++++ .../models/hy_v3/test_hy_v3_config.py | 139 +++++++ .../models/hy_v3/test_hy_v3_layers.py | 185 +++++++++ .../models/hy_v3/test_hy_v3_model.py | 380 ++++++++++++++++++ tests/unit_tests/recipes/test_train_ft.py | 57 +++ 5 files changed, 832 insertions(+) create mode 100644 tests/unit_tests/_transformers/test_registry_hy_v3.py create mode 100644 tests/unit_tests/models/hy_v3/test_hy_v3_config.py create mode 100644 tests/unit_tests/models/hy_v3/test_hy_v3_layers.py create mode 100644 tests/unit_tests/models/hy_v3/test_hy_v3_model.py diff --git a/tests/unit_tests/_transformers/test_registry_hy_v3.py b/tests/unit_tests/_transformers/test_registry_hy_v3.py new file mode 100644 index 000000000..1a256705b --- /dev/null +++ b/tests/unit_tests/_transformers/test_registry_hy_v3.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Verify HYV3 model + config are registered in nemo_automodel._transformers.registry.""" + +import pytest + + +class TestArchMapping: + def test_hyv3_arch_registered(self): + from nemo_automodel._transformers.registry import MODEL_ARCH_MAPPING + + assert "HYV3ForCausalLM" in MODEL_ARCH_MAPPING + + def test_hyv3_arch_points_at_correct_module(self): + from nemo_automodel._transformers.registry import MODEL_ARCH_MAPPING + + entry = MODEL_ARCH_MAPPING["HYV3ForCausalLM"] + assert entry[0] == "nemo_automodel.components.models.hy_v3.model" + assert entry[1] == "HYV3ForCausalLM" + + def test_hyv3_arch_resolves_to_class(self): + """Walk the mapping path -- importable + the named class exists.""" + import importlib + + from nemo_automodel._transformers.registry import MODEL_ARCH_MAPPING + + mod_path, cls_name, *_ = MODEL_ARCH_MAPPING["HYV3ForCausalLM"] + mod = importlib.import_module(mod_path) + assert hasattr(mod, cls_name) + + +class TestCustomConfigRegistration: + def test_hy_v3_config_registered(self): + from nemo_automodel._transformers.registry import _CUSTOM_CONFIG_REGISTRATIONS + + assert "hy_v3" in _CUSTOM_CONFIG_REGISTRATIONS + + def test_hy_v3_config_resolves_to_class(self): + import importlib + + from nemo_automodel._transformers.registry import _CUSTOM_CONFIG_REGISTRATIONS + + mod_path, cls_name = _CUSTOM_CONFIG_REGISTRATIONS["hy_v3"] + mod = importlib.import_module(mod_path) + cls = getattr(mod, cls_name) + assert cls.__name__ == "HYV3Config" + assert cls.model_type == "hy_v3" + + +class TestRetrievalMappingSync: + """Sync-state tests: the PR removed the Ministral3 bidirectional retrieval + backbone; ensure those keys are no longer in SUPPORTED_BACKBONES.""" + + def test_ministral3_not_in_supported_backbones(self): + from nemo_automodel._transformers.retrieval import SUPPORTED_BACKBONES + + assert "ministral3" not in SUPPORTED_BACKBONES + assert "ministral3_bidirec" not in SUPPORTED_BACKBONES + assert "mistral3" not in SUPPORTED_BACKBONES diff --git a/tests/unit_tests/models/hy_v3/test_hy_v3_config.py b/tests/unit_tests/models/hy_v3/test_hy_v3_config.py new file mode 100644 index 000000000..5ebebb031 --- /dev/null +++ b/tests/unit_tests/models/hy_v3/test_hy_v3_config.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``HYV3Config``.""" + +import pytest +from transformers import PretrainedConfig + +from nemo_automodel.components.models.hy_v3.config import HYV3Config + + +class TestDefaults: + def test_model_type(self): + assert HYV3Config.model_type == "hy_v3" + + def test_inherits_pretrained_config(self): + cfg = HYV3Config() + assert isinstance(cfg, PretrainedConfig) + + def test_default_attributes_match_295b(self): + cfg = HYV3Config() + # Architecture defaults from the published Hy3-preview spec. + assert cfg.vocab_size == 129280 + assert cfg.hidden_size == 4096 + assert cfg.intermediate_size == 1536 + assert cfg.moe_intermediate_size == 1536 + assert cfg.num_hidden_layers == 80 + assert cfg.num_attention_heads == 64 + assert cfg.num_key_value_heads == 8 + assert cfg.head_dim == 128 + assert cfg.num_experts == 192 + assert cfg.num_shared_experts == 1 + assert cfg.num_experts_per_tok == 8 + assert cfg.first_k_dense_replace == 1 + assert cfg.max_position_embeddings == 262144 + assert cfg.rope_theta == 11158840.0 + assert cfg.rms_norm_eps == 1e-6 + assert cfg.attention_bias is False + assert cfg.hidden_act == "silu" + # torch_dtype is auto-coerced by PretrainedConfig (deprecated -> dtype); + # accept either the string we set or whatever the base class normalizes to. + assert cfg.torch_dtype in ("bfloat16", None) or str(cfg.torch_dtype).endswith("bfloat16") + assert cfg.tie_word_embeddings is False + assert cfg.moe_router_enable_expert_bias is True + + def test_keys_to_ignore_at_inference(self): + assert HYV3Config.keys_to_ignore_at_inference == ["past_key_values"] + + +class TestOverrides: + def test_override_attention_dims(self): + cfg = HYV3Config( + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_size=512, + ) + assert cfg.num_attention_heads == 8 + assert cfg.num_key_value_heads == 2 + assert cfg.head_dim == 64 + assert cfg.hidden_size == 512 + + def test_override_moe_routing(self): + cfg = HYV3Config(num_experts=64, num_experts_per_tok=4, num_shared_experts=2, router_scaling_factor=1.5) + assert cfg.num_experts == 64 + assert cfg.num_experts_per_tok == 4 + assert cfg.num_shared_experts == 2 + assert cfg.router_scaling_factor == 1.5 + + def test_truncated_layer_count(self): + cfg = HYV3Config(num_hidden_layers=4) + assert cfg.num_hidden_layers == 4 + + def test_first_k_dense_replace(self): + cfg = HYV3Config(first_k_dense_replace=3) + assert cfg.first_k_dense_replace == 3 + + def test_router_flags(self): + cfg = HYV3Config(route_norm=True, moe_router_enable_expert_bias=False) + assert cfg.route_norm is True + assert cfg.moe_router_enable_expert_bias is False + + def test_rope_overrides(self): + cfg = HYV3Config(rope_theta=500000.0, max_position_embeddings=4096) + assert cfg.rope_theta == 500000.0 + assert cfg.max_position_embeddings == 4096 + + def test_rope_scaling_dict(self): + scaling = {"factor": 8.0, "rope_type": "yarn"} + cfg = HYV3Config(rope_scaling=scaling) + assert cfg.rope_scaling == scaling + + def test_token_ids(self): + cfg = HYV3Config(pad_token_id=0, bos_token_id=10, eos_token_id=11) + assert cfg.pad_token_id == 0 + assert cfg.bos_token_id == 10 + assert cfg.eos_token_id == 11 + + def test_super_init_kwargs_accepted(self): + # Verify that PretrainedConfig-recognized kwargs (here: use_cache, + # tie_word_embeddings) flow through __init__ without raising. + HYV3Config(use_cache=False, tie_word_embeddings=True) + + def test_extra_kwargs_pass_through_super_init(self): + # PretrainedConfig **kwargs in newer transformers no longer attaches + # arbitrary fields to the instance, but the call should still succeed. + cfg = HYV3Config(custom_field="abc") + assert isinstance(cfg, HYV3Config) + + +class TestSerialization: + def test_to_dict_round_trip(self): + cfg = HYV3Config(num_hidden_layers=4, num_experts=8, hidden_size=256) + d = cfg.to_dict() + assert d["model_type"] == "hy_v3" + assert d["num_hidden_layers"] == 4 + assert d["num_experts"] == 8 + + rebuilt = HYV3Config(**{k: v for k, v in d.items() if k != "model_type"}) + assert rebuilt.num_hidden_layers == 4 + assert rebuilt.num_experts == 8 + assert rebuilt.hidden_size == 256 + + def test_model_type_class_attribute_not_overridden_by_instance(self): + cfg = HYV3Config() + # model_type is a class-level attribute that AutoConfig dispatches on. + assert cfg.model_type == "hy_v3" + assert HYV3Config.model_type == "hy_v3" diff --git a/tests/unit_tests/models/hy_v3/test_hy_v3_layers.py b/tests/unit_tests/models/hy_v3/test_hy_v3_layers.py new file mode 100644 index 000000000..7a3eb5468 --- /dev/null +++ b/tests/unit_tests/models/hy_v3/test_hy_v3_layers.py @@ -0,0 +1,185 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``HYV3Attention``.""" + +from unittest.mock import patch + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_v3.config import HYV3Config +from nemo_automodel.components.models.hy_v3.layers import HYV3Attention + + +pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + + +HIDDEN = 64 +N_HEADS = 8 +N_KV = 2 +HEAD_DIM = 16 + + +@pytest.fixture +def device(): + return torch.device(f"cuda:{torch.cuda.current_device()}") + + +@pytest.fixture +def config(): + return HYV3Config( + vocab_size=128, + hidden_size=HIDDEN, + intermediate_size=128, + moe_intermediate_size=64, + num_hidden_layers=1, + num_attention_heads=N_HEADS, + num_key_value_heads=N_KV, + head_dim=HEAD_DIM, + max_position_embeddings=128, + rope_theta=10000.0, + rms_norm_eps=1e-6, + ) + + +@pytest.fixture +def sdpa_backend(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", + experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + enable_hf_state_dict_adapter=False, + rope_fusion=False, + ) + + +def _make_freqs_cis(seq_len: int, device: torch.device) -> torch.Tensor: + """Synthesize a freqs_cis tensor of shape (1, seq, head_dim) — matches + ``apply_rotary_emb_qk(format='bshd', rope_fusion=False)`` which splits + [cos, sin] along the last dim.""" + return torch.zeros(1, seq_len, HEAD_DIM, device=device) + + +# --------------------------------------------------------------------------- +# Initialization +# --------------------------------------------------------------------------- + + +class TestInit: + def test_module_attributes(self, config, sdpa_backend): + attn = HYV3Attention(config, backend=sdpa_backend) + assert attn.num_heads == N_HEADS + assert attn.num_kv_heads == N_KV + assert attn.head_dim == HEAD_DIM + assert attn.backend is sdpa_backend + + def test_projection_shapes(self, config, sdpa_backend): + attn = HYV3Attention(config, backend=sdpa_backend) + assert attn.q_proj.weight.shape == (N_HEADS * HEAD_DIM, HIDDEN) + assert attn.k_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN) + assert attn.v_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN) + assert attn.o_proj.weight.shape == (HIDDEN, N_HEADS * HEAD_DIM) + + def test_q_k_norm_per_head_dim(self, config, sdpa_backend): + """qk_norm is applied per-head, so the norm weight is sized to head_dim.""" + attn = HYV3Attention(config, backend=sdpa_backend) + assert attn.q_norm.weight.shape == (HEAD_DIM,) + assert attn.k_norm.weight.shape == (HEAD_DIM,) + + def test_no_attention_bias_by_default(self, config, sdpa_backend): + attn = HYV3Attention(config, backend=sdpa_backend) + assert attn.q_proj.bias is None + assert attn.k_proj.bias is None + assert attn.v_proj.bias is None + assert attn.o_proj.bias is None + + def test_attention_bias_enabled(self, config, sdpa_backend): + config.attention_bias = True + attn = HYV3Attention(config, backend=sdpa_backend) + assert attn.q_proj.bias is not None + assert attn.k_proj.bias is not None + assert attn.v_proj.bias is not None + assert attn.o_proj.bias is not None + + +# --------------------------------------------------------------------------- +# Forward +# --------------------------------------------------------------------------- + + +class TestForward: + def test_output_shape_bshd(self, config, sdpa_backend, device): + attn = HYV3Attention(config, backend=sdpa_backend).to(device) + bsz, seqlen = 2, 4 + x = torch.randn(bsz, seqlen, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(seqlen, device) + + out = attn(x, freqs_cis=freqs) + assert out.shape == (bsz, seqlen, HIDDEN) + + def test_calls_q_k_v_o_projections(self, config, sdpa_backend, device): + attn = HYV3Attention(config, backend=sdpa_backend).to(device) + bsz, seqlen = 1, 3 + x = torch.randn(bsz, seqlen, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(seqlen, device) + + with patch.object(attn.q_proj, "forward", wraps=attn.q_proj.forward) as q, \ + patch.object(attn.k_proj, "forward", wraps=attn.k_proj.forward) as k, \ + patch.object(attn.v_proj, "forward", wraps=attn.v_proj.forward) as v, \ + patch.object(attn.o_proj, "forward", wraps=attn.o_proj.forward) as o: + attn(x, freqs_cis=freqs) + q.assert_called_once() + k.assert_called_once() + v.assert_called_once() + o.assert_called_once() + + def test_attention_mask_to_padding_mask_unused_in_sdpa(self, config, sdpa_backend, device): + """With sdpa+causal the attention mask is converted via preprocess but never + propagates to the kernel (is_causal=True suffices); we just check forward + completes without raising.""" + attn = HYV3Attention(config, backend=sdpa_backend).to(device) + bsz, seqlen = 1, 4 + x = torch.randn(bsz, seqlen, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(seqlen, device) + mask = torch.ones(bsz, seqlen, dtype=torch.float32, device=device) + out = attn(x, freqs_cis=freqs, attention_mask=mask) + assert out.shape == x.shape + + +# --------------------------------------------------------------------------- +# Init weights +# --------------------------------------------------------------------------- + + +class TestInitWeights: + def test_resets_norms_and_linears(self, config, sdpa_backend, device): + attn = HYV3Attention(config, backend=sdpa_backend).to(device) + with patch.object(attn.q_norm, "reset_parameters") as qn, \ + patch.object(attn.k_norm, "reset_parameters") as kn: + attn.init_weights(buffer_device=device, init_std=0.01) + qn.assert_called_once() + kn.assert_called_once() + + def test_uses_provided_init_std(self, config, sdpa_backend, device): + attn = HYV3Attention(config, backend=sdpa_backend).to(device) + attn.init_weights(buffer_device=device, init_std=0.5) + # trunc_normal_ has the requested std (within stat tolerance for a small tensor). + # Just verify the linears were re-initialized to non-default values. + for lin in (attn.q_proj, attn.k_proj, attn.v_proj, attn.o_proj): + assert lin.weight.abs().max().item() > 0 diff --git a/tests/unit_tests/models/hy_v3/test_hy_v3_model.py b/tests/unit_tests/models/hy_v3/test_hy_v3_model.py new file mode 100644 index 000000000..1023d1412 --- /dev/null +++ b/tests/unit_tests/models/hy_v3/test_hy_v3_model.py @@ -0,0 +1,380 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for the HYV3 Block / HYV3Model / HYV3ForCausalLM layers.""" + +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_v3.config import HYV3Config +from nemo_automodel.components.models.hy_v3.model import ( + Block, + HYV3ForCausalLM, + HYV3Model, + ModelClass, +) +from nemo_automodel.components.moe.config import MoEConfig +from nemo_automodel.components.moe.layers import MLP, MoE + + +pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + + +HIDDEN = 64 +INTER = 128 +MOE_INTER = 64 +N_HEADS = 8 +N_KV = 2 +HEAD_DIM = 16 +N_EXPERTS = 4 + + +@pytest.fixture +def device(): + return torch.device(f"cuda:{torch.cuda.current_device()}") + + +@pytest.fixture +def config(): + return HYV3Config( + vocab_size=128, + hidden_size=HIDDEN, + intermediate_size=INTER, + moe_intermediate_size=MOE_INTER, + num_hidden_layers=2, + num_attention_heads=N_HEADS, + num_key_value_heads=N_KV, + head_dim=HEAD_DIM, + num_experts=N_EXPERTS, + num_experts_per_tok=2, + num_shared_experts=1, + first_k_dense_replace=1, + max_position_embeddings=128, + rope_theta=10000.0, + rms_norm_eps=1e-6, + ) + + +@pytest.fixture +def backend_config(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", + experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + gate_precision="float32", + rope_fusion=False, + enable_hf_state_dict_adapter=False, + enable_fsdp_optimizations=False, + ) + + +@pytest.fixture +def moe_config(config): + return MoEConfig( + dim=config.hidden_size, + inter_dim=config.intermediate_size, + moe_inter_dim=config.moe_intermediate_size, + n_routed_experts=config.num_experts, + n_shared_experts=config.num_shared_experts, + n_activated_experts=config.num_experts_per_tok, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + gate_bias_update_factor=0.0, + score_func="sigmoid", + route_scale=1.0, + aux_loss_coeff=0.0, + norm_topk_prob=False, + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + force_e_score_correction_bias=True, + ) + + +# --------------------------------------------------------------------------- +# Block +# --------------------------------------------------------------------------- + + +class TestBlock: + def test_dense_layer_uses_mlp_when_idx_below_first_k_dense(self, config, moe_config, backend_config): + """layer_idx < first_k_dense_replace -> dense MLP, not MoE.""" + config.first_k_dense_replace = 1 + block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MLP) + assert not isinstance(block.mlp, MoE) + + def test_moe_layer_uses_moe_when_idx_at_or_above_first_k_dense(self, config, moe_config, backend_config): + config.first_k_dense_replace = 1 + block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MoE) + + def test_first_k_dense_replace_higher_threshold(self, config, moe_config, backend_config): + """If first_k_dense_replace=3, layers 0-2 are dense and 3+ are MoE.""" + config.first_k_dense_replace = 3 + for i in (0, 1, 2): + block = Block(layer_idx=i, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MLP), f"layer {i} should be dense" + block = Block(layer_idx=3, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MoE) + + def test_block_has_required_submodules(self, config, moe_config, backend_config): + block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config) + assert hasattr(block, "self_attn") + assert hasattr(block, "mlp") + assert hasattr(block, "input_layernorm") + assert hasattr(block, "post_attention_layernorm") + assert block.layer_idx == 1 + + def test_forward_residual_calls_attn_then_mlp(self, config, moe_config, backend_config, device): + block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config).to(device) + bsz, seq = 2, 4 + x = torch.randn(bsz, seq, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = torch.zeros(1, seq, HEAD_DIM, device=device) + with patch.object(block.self_attn, "forward", return_value=torch.zeros_like(x)) as mock_attn, \ + patch.object(block, "_mlp", return_value=torch.zeros_like(x)) as mock_mlp: + out = block(x, freqs_cis=freqs) + assert out.shape == x.shape + mock_attn.assert_called_once() + mock_mlp.assert_called_once() + + def test_padding_mask_built_from_attention_mask(self, config, moe_config, backend_config, device): + block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config).to(device) + x = torch.randn(1, 3, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = torch.zeros(1, 3, HEAD_DIM, device=device) + mask = torch.tensor([[1, 1, 0]], dtype=torch.bool, device=device) + with patch.object(block.self_attn, "forward", return_value=torch.zeros_like(x)), \ + patch.object(block, "_mlp", return_value=torch.zeros_like(x)) as mock_mlp: + block(x, freqs_cis=freqs, attention_mask=mask) + _, kwargs = mock_mlp.call_args + torch.testing.assert_close(kwargs["padding_mask"], mask.logical_not()) + + def test_mlp_wrapper_dense_path(self, config, moe_config, backend_config, device): + config.first_k_dense_replace = 1 + block = ( + Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config) + .to(device) + .to(torch.bfloat16) + ) + x = torch.randn(2, 4, HIDDEN, device=device, dtype=torch.bfloat16) + out = block._mlp(x, padding_mask=None) + assert out.shape == x.shape + + def test_init_weights_invokes_subcomponents(self, config, moe_config, backend_config, device): + block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config).to(device) + with patch.object(block.input_layernorm, "reset_parameters") as in_norm, \ + patch.object(block.post_attention_layernorm, "reset_parameters") as post_norm, \ + patch.object(block.self_attn, "init_weights") as attn_init, \ + patch.object(block.mlp, "init_weights") as mlp_init: + block.init_weights(buffer_device=device) + in_norm.assert_called_once() + post_norm.assert_called_once() + attn_init.assert_called_once() + mlp_init.assert_called_once() + + +# --------------------------------------------------------------------------- +# HYV3Model +# --------------------------------------------------------------------------- + + +class TestHYV3Model: + def test_construction_sets_components(self, config, backend_config): + model = HYV3Model(config, backend=backend_config) + assert len(model.layers) == config.num_hidden_layers + assert model.embed_tokens.num_embeddings == config.vocab_size + assert model.norm is not None + assert model.rotary_emb.head_dim == config.head_dim + assert isinstance(model.moe_config, MoEConfig) + + def test_dense_then_moe_layer_structure(self, config, backend_config): + config.first_k_dense_replace = 1 + config.num_hidden_layers = 3 + model = HYV3Model(config, backend=backend_config) + assert isinstance(model.layers["0"].mlp, MLP) + assert isinstance(model.layers["1"].mlp, MoE) + assert isinstance(model.layers["2"].mlp, MoE) + + def test_moe_config_inferred_from_config(self, config, backend_config): + model = HYV3Model(config, backend=backend_config) + mc = model.moe_config + assert mc.dim == config.hidden_size + assert mc.moe_inter_dim == config.moe_intermediate_size + assert mc.n_routed_experts == config.num_experts + assert mc.n_activated_experts == config.num_experts_per_tok + assert mc.n_shared_experts == config.num_shared_experts + assert mc.score_func == "sigmoid" + assert mc.expert_activation == "swiglu" + + def test_moe_overrides_take_effect(self, config, backend_config): + model = HYV3Model(config, backend=backend_config, moe_overrides={"score_func": "softmax", "route_scale": 2.0}) + assert model.moe_config.score_func == "softmax" + assert model.moe_config.route_scale == 2.0 + + def test_explicit_moe_config_passes_through(self, config, backend_config, moe_config): + model = HYV3Model(config, backend=backend_config, moe_config=moe_config) + assert model.moe_config is moe_config + + def test_explicit_moe_config_and_overrides_conflict(self, config, backend_config, moe_config): + with pytest.raises(ValueError, match="Cannot pass both"): + HYV3Model(config, backend=backend_config, moe_config=moe_config, moe_overrides={"score_func": "softmax"}) + + def test_forward_runs_all_layers(self, config, backend_config, device): + model = HYV3Model(config, backend=backend_config).to(device) + bsz, seq = 1, 4 + input_ids = torch.randint(0, config.vocab_size, (bsz, seq), device=device) + with patch.object(Block, "forward", side_effect=lambda x=None, **kw: x if x is not None else kw["x"]) as mock_block: + out = model(input_ids) + assert out.shape == (bsz, seq, HIDDEN) + assert mock_block.call_count == config.num_hidden_layers + + def test_forward_accepts_explicit_position_ids(self, config, backend_config, device): + model = HYV3Model(config, backend=backend_config).to(device) + input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device) + position_ids = torch.arange(4, device=device).unsqueeze(0) + with patch.object(Block, "forward", side_effect=lambda x=None, **kw: x if x is not None else kw["x"]): + out = model(input_ids, position_ids=position_ids) + assert out.shape == (1, 4, HIDDEN) + + def test_init_weights_resets_layers_norm_embeddings(self, config, backend_config, device): + model = HYV3Model(config, backend=backend_config).to(device) + embed_before = model.embed_tokens.weight.detach().clone() + with patch.object(model.norm, "reset_parameters") as mock_norm, \ + patch.object(Block, "init_weights") as mock_layer_init: + model.init_weights(buffer_device=device) + mock_norm.assert_called_once() + assert mock_layer_init.call_count == config.num_hidden_layers + # Embedding weights are re-initialized. + assert not torch.equal(model.embed_tokens.weight.detach(), embed_before) + + +# --------------------------------------------------------------------------- +# HYV3ForCausalLM +# --------------------------------------------------------------------------- + + +class TestHYV3ForCausalLM: + def test_construction_attaches_model_and_lm_head(self, config, backend_config): + model = HYV3ForCausalLM(config, backend=backend_config) + assert isinstance(model.model, HYV3Model) + assert model.lm_head.weight.shape == (config.vocab_size, config.hidden_size) + assert model.config is config + + def test_state_dict_adapter_attached_when_enabled(self, config, backend_config): + backend_config.enable_hf_state_dict_adapter = True + model = HYV3ForCausalLM(config, backend=backend_config) + from nemo_automodel.components.models.hy_v3.state_dict_adapter import HYV3StateDictAdapter + + assert hasattr(model, "state_dict_adapter") + assert isinstance(model.state_dict_adapter, HYV3StateDictAdapter) + + def test_state_dict_adapter_not_attached_when_disabled(self, config, backend_config): + backend_config.enable_hf_state_dict_adapter = False + model = HYV3ForCausalLM(config, backend=backend_config) + assert not hasattr(model, "state_dict_adapter") + + def test_default_backend_built_when_omitted(self, config): + model = HYV3ForCausalLM(config) + assert isinstance(model.backend, BackendConfig) + + def test_get_set_input_embeddings(self, config, backend_config): + model = HYV3ForCausalLM(config, backend=backend_config) + new_emb = torch.nn.Embedding(config.vocab_size, config.hidden_size) + model.set_input_embeddings(new_emb) + assert model.get_input_embeddings() is new_emb + + def test_get_set_output_embeddings(self, config, backend_config): + model = HYV3ForCausalLM(config, backend=backend_config) + new_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) + model.set_output_embeddings(new_head) + assert model.get_output_embeddings() is new_head + + def test_forward_returns_logits_shape(self, config, backend_config, device): + model = HYV3ForCausalLM(config, backend=backend_config).to(device) + bsz, seq = 2, 6 + input_ids = torch.randint(0, config.vocab_size, (bsz, seq), device=device) + fake_hidden = torch.randn(bsz, seq, config.hidden_size, device=device, dtype=torch.bfloat16) + with patch.object(model.model, "forward", return_value=fake_hidden): + logits = model(input_ids) + assert logits.shape == (bsz, seq, config.vocab_size) + + def test_initialize_weights_invokes_inner_init(self, config, backend_config, device): + model = HYV3ForCausalLM(config, backend=backend_config).to(device) + with patch.object(model.model, "init_weights") as mock_init: + model.initialize_weights(buffer_device=device, dtype=torch.float32) + mock_init.assert_called_once() + assert model.lm_head.weight.dtype == torch.float32 + + def test_update_moe_gate_bias_no_op_when_factor_zero(self, config, backend_config, device): + """gate_bias_update_factor defaults to 0.0; update_moe_gate_bias must NOT call + gate.update_bias() when the factor is zero (the bug fixed in 564ff4f2).""" + model = HYV3ForCausalLM(config, backend=backend_config).to(device) + for layer in model.model.layers.values(): + if isinstance(layer.mlp, MoE): + with patch.object(layer.mlp.gate, "update_bias") as mock: + model.update_moe_gate_bias() + mock.assert_not_called() + + def test_update_moe_gate_bias_calls_when_factor_positive(self, config, backend_config, device): + model = HYV3ForCausalLM( + config, backend=backend_config, moe_overrides={"gate_bias_update_factor": 1e-3} + ).to(device) + called = 0 + for layer in model.model.layers.values(): + if isinstance(layer.mlp, MoE): + with patch.object(layer.mlp.gate, "update_bias") as mock: + model.update_moe_gate_bias() + if mock.called: + called += 1 + assert called > 0 + + def test_from_config_classmethod_passes_through(self, config, backend_config): + model = HYV3ForCausalLM.from_config(config, backend=backend_config) + assert isinstance(model, HYV3ForCausalLM) + + def test_from_pretrained_resolves_config_then_delegates(self, config, backend_config): + with patch("transformers.AutoConfig.from_pretrained", return_value=config) as mock_acfg, \ + patch.object(HYV3ForCausalLM, "from_config", wraps=HYV3ForCausalLM.from_config) as mock_fc: + model = HYV3ForCausalLM.from_pretrained("tencent/Hy3-preview", backend=backend_config) + mock_acfg.assert_called_once() + mock_fc.assert_called_once() + assert isinstance(model, HYV3ForCausalLM) + + def test_modelclass_alias(self): + assert ModelClass is HYV3ForCausalLM + + +# --------------------------------------------------------------------------- +# Module-level export +# --------------------------------------------------------------------------- + + +class TestModuleExports: + def test_init_exports_hyv3_for_causal_lm(self): + from nemo_automodel.components.models.hy_v3 import HYV3ForCausalLM as exported + + assert exported is HYV3ForCausalLM + + def test_module_class_pointer(self): + from nemo_automodel.components.models.hy_v3 import model as mod + + assert mod.ModelClass is HYV3ForCausalLM diff --git a/tests/unit_tests/recipes/test_train_ft.py b/tests/unit_tests/recipes/test_train_ft.py index dfd045ec1..623c9694a 100644 --- a/tests/unit_tests/recipes/test_train_ft.py +++ b/tests/unit_tests/recipes/test_train_ft.py @@ -291,6 +291,63 @@ def test_peft_with_tp_disables_triton(caplog): assert "Disabling Triton with TP" in caplog.text +def test_build_checkpoint_config_rejects_peft_with_torch_save(): + """PEFT + torch_save must hard-error (no silent fallback to safetensors).""" + from nemo_automodel.recipes.llm.train_ft import build_checkpoint_config + + cfg_ckpt = MagicMock() + cfg_ckpt.to_dict.return_value = { + "model_save_format": "torch_save", + "checkpoint_dir": "/user/ckpt/", + } + + with pytest.raises(ValueError, match="PEFT checkpointing is not supported for torch_save"): + build_checkpoint_config( + cfg_ckpt=cfg_ckpt, + cache_dir=None, + model_repo_id="org/model", + is_peft=True, + ) + + +def test_build_checkpoint_config_accepts_peft_with_safetensors(): + """PEFT + safetensors (the default) is the supported path.""" + from nemo_automodel.components.checkpoint._backports.filesystem import SerializationFormat + from nemo_automodel.recipes.llm.train_ft import build_checkpoint_config + + cfg_ckpt = MagicMock() + cfg_ckpt.to_dict.return_value = { + "model_save_format": "safetensors", + "checkpoint_dir": "/user/ckpt/", + } + config = build_checkpoint_config( + cfg_ckpt=cfg_ckpt, + cache_dir=None, + model_repo_id="org/model", + is_peft=True, + ) + assert config.is_peft is True + assert config.model_save_format == SerializationFormat.SAFETENSORS + assert config.checkpoint_dir == "/user/ckpt/" + + +def test_build_checkpoint_config_accepts_non_peft_with_torch_save(): + """Non-PEFT runs may use torch_save freely.""" + from nemo_automodel.components.checkpoint._backports.filesystem import SerializationFormat + from nemo_automodel.recipes.llm.train_ft import build_checkpoint_config + + cfg_ckpt = MagicMock() + cfg_ckpt.to_dict.return_value = {"model_save_format": "torch_save"} + config = build_checkpoint_config( + cfg_ckpt=cfg_ckpt, + cache_dir=None, + model_repo_id="org/model", + is_peft=False, + ) + assert config.is_peft is False + assert config.model_save_format == SerializationFormat.TORCH_SAVE + + def test_build_dataloader_iterable_shard_and_shuffle_removed_from_cfg(monkeypatch): # cfg_ds: target resolves to this test module dataset class cfg_ds = ConfigNode( From 1d511f8ddcf0aa26ecae78fc4ab9c090a14d422e Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Tue, 28 Apr 2026 16:17:45 -0700 Subject: [PATCH 27/27] docs(llm): announce Hy3-preview in README and latest-models log Mirror the MiniMax-M2.7 PR (#1785) doc additions for the new HYV3 support: news entry at the top of README.md "What's New" and a row in docs/model-coverage/latest-models.md. Per-model coverage page (docs/model-coverage/llm/tencent/hy3.md) and llm/index.md row are already present. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: HuiyingLi --- README.md | 1 + docs/model-coverage/latest-models.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 9fdd9db6f..004fdcfd3 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ ## 📣 News and Discussions - [04/25/2026][**DeepSeek V4 Flash**](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) We now support finetuning `deepseek-ai/DeepSeek-V4-Flash`, thanks to [@Khazic](https://github.com/khazic). Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml) and [guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/llm/dsv4-flash.md). +- [04/28/2026][**Hy3-preview**](https://huggingface.co/tencent/Hy3-preview) We now support finetuning `tencent/Hy3-preview`, thanks to [@Khazic](https://github.com/khazic). Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml). - [04/22/2026][**Qwen3.6-27B**](https://huggingface.co/Qwen/Qwen3.6-27B) We now support finetuning `Qwen/Qwen3.6-27B`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5/qwen3_6_27b.yaml). - [04/20/2026][**Qwen-Image**](https://huggingface.co/Qwen/Qwen-Image) We now support finetuning `Qwen/Qwen-Image`, thanks to [@harshareddy832](https://github.com/harshareddy832). Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/qwen_image_t2i_flow.yaml). - [04/16/2026][**Qwen3.6 MoE**](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) We now support finetuning `Qwen/Qwen3.6-35B-A3B`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml). diff --git a/docs/model-coverage/latest-models.md b/docs/model-coverage/latest-models.md index efbdbe5ba..dcd27080f 100644 --- a/docs/model-coverage/latest-models.md +++ b/docs/model-coverage/latest-models.md @@ -6,6 +6,7 @@ See the [Model Coverage Overview](overview.md) for release summaries, and the [L | Date | Model | HF Model ID | Modality | Recipe | Try on Brev | |------|-------|-------------|----------|--------|------| +| 2026-04-28 | Hy3-preview | [`tencent/Hy3-preview`](https://huggingface.co/tencent/Hy3-preview) | LLM | [hy3_preview_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml) | 🚧 | | 2026-04-25 | DeepSeek V4 Flash | [`deepseek-ai/DeepSeek-V4-Flash`](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | LLM | [deepseek_v4_flash_hellaswag.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml) | 🚧 | | 2026-04-22 | Qwen3.6-27B | [`Qwen/Qwen3.6-27B`](https://huggingface.co/Qwen/Qwen3.6-27B) | VLM | [qwen3_6_27b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5/qwen3_6_27b.yaml) | 🚧 | | 2026-04-16 | LLaVA-OneVision-1.5 (4B / 8B) | [`lmms-lab/LLaVA-OneVision-1.5-4B-Instruct`](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct) | VLM | [llava_ov_1_5_4b_finetune.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/llava_onevision/llava_ov_1_5_4b_finetune.yaml) | 🚧 |