diff --git a/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml b/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
index f5528080ab..60c823b58c 100644
--- a/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
+++ b/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
@@ -55,7 +55,7 @@ model:
   _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
   config:
     _target_: transformers.AutoConfig.from_pretrained
-    pretrained_model_name_or_path: /path/to/model
+    pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 # pragma: allowlist secret
     trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_benchmark/step/step35flash_lora.yaml b/examples/llm_benchmark/step/step35flash_lora.yaml
index c6802da3cc..a4e81199f1 100644
--- a/examples/llm_benchmark/step/step35flash_lora.yaml
+++ b/examples/llm_benchmark/step/step35flash_lora.yaml
@@ -64,6 +64,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml b/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
index a6652a1b78..f384d6f9db 100644
--- a/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
+++ b/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
@@ -71,6 +71,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml b/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
index 599319e05b..64f93f7f84 100644
--- a/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
+++ b/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
@@ -123,7 +123,7 @@ ci:
   vllm_deploy: true
   vllm_smoke_test: true
   checkpoint_robustness:
-    hf_kl_threshold: 5e-2
+    hf_kl_threshold: 1e-1
     tokenizer_name: openai/gpt-oss-20b
     check_phantom_keys: true
     no_check_resume: true
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
index 9fe160460b..d24fb31495 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
@@ -85,7 +85,7 @@ validation_dataloader:
 
 optimizer:
   _target_: torch.optim.AdamW
-  lr: 1.0e-3
+  lr: 1.0e-4
   weight_decay: 0.01
   betas: [0.9, 0.95]
   eps: 1e-8
@@ -94,7 +94,7 @@ ci:
   recipe_owner: hemildesai
   time: "00:15:00"
   checkpoint_robustness:
-    hf_kl_threshold: 1e-4
+    hf_kl_threshold: 1e-3
     tokenizer_name: Qwen/Qwen3-30B-A3B
     no_check_resume: true
     dataset.num_samples_limit: 500
diff --git a/nemo_automodel/components/models/qwen3_next/model.py b/nemo_automodel/components/models/qwen3_next/model.py
index 6501f024b5..7c7df49a24 100644
--- a/nemo_automodel/components/models/qwen3_next/model.py
+++ b/nemo_automodel/components/models/qwen3_next/model.py
@@ -77,10 +77,6 @@ def forward(
             attn_out = self.linear_attn(
                 hidden_states=self.input_layernorm(x),
                 attention_mask=attention_mask,
-                position_ids=position_ids,
-                qkv_format=attn_kwargs.get("qkv_format"),
-                cu_seqlens=attn_kwargs.get("cu_seqlens"),
-                seq_index=attn_kwargs.get("seq_index"),
             )
         elif self.layer_type == "full_attention":
             attn_out = self.self_attn(