NVIDIA-NeMo · akoumpa · Apr 17, 2026 · Apr 16, 2026
@@ -55,7 +55,7 @@ model:
   _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
   config:
     _target_: transformers.AutoConfig.from_pretrained
-    pretrained_model_name_or_path: /path/to/model
+    pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16  # pragma: allowlist secret
     trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig

@@ -64,6 +64,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig

@@ -71,6 +71,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig

@@ -123,7 +123,7 @@ ci:
   vllm_deploy: true
   vllm_smoke_test: true
   checkpoint_robustness:
-    hf_kl_threshold: 5e-2
+    hf_kl_threshold: 1e-1
     tokenizer_name: openai/gpt-oss-20b
     check_phantom_keys: true
     no_check_resume: true

@@ -85,7 +85,7 @@ validation_dataloader:
 
 optimizer:
   _target_: torch.optim.AdamW
-  lr: 1.0e-3
+  lr: 1.0e-4
   weight_decay: 0.01
   betas: [0.9, 0.95]
   eps: 1e-8
@@ -94,7 +94,7 @@ ci:
   recipe_owner: hemildesai
   time: "00:15:00"
   checkpoint_robustness:
-    hf_kl_threshold: 1e-4
+    hf_kl_threshold: 1e-3
     tokenizer_name: Qwen/Qwen3-30B-A3B
     no_check_resume: true
     dataset.num_samples_limit: 500

@@ -77,10 +77,6 @@ def forward(
             attn_out = self.linear_attn(
                 hidden_states=self.input_layernorm(x),
                 attention_mask=attention_mask,
-                position_ids=position_ids,
-                qkv_format=attn_kwargs.get("qkv_format"),
-                cu_seqlens=attn_kwargs.get("cu_seqlens"),
-                seq_index=attn_kwargs.get("seq_index"),
             )
         elif self.layer_type == "full_attention":
             attn_out = self.self_attn(