diff --git a/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml b/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
index f5528080ab..60c823b58c 100644
--- a/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
+++ b/examples/llm_benchmark/nemotron/nemotron_super_v3_te_deepep.yaml
@@ -55,7 +55,7 @@ model:
   _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
   config:
     _target_: transformers.AutoConfig.from_pretrained
-    pretrained_model_name_or_path: /path/to/model
+    pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 # pragma: allowlist secret
     trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_benchmark/step/step35flash_lora.yaml b/examples/llm_benchmark/step/step35flash_lora.yaml
index c6802da3cc..a4e81199f1 100644
--- a/examples/llm_benchmark/step/step35flash_lora.yaml
+++ b/examples/llm_benchmark/step/step35flash_lora.yaml
@@ -64,6 +64,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml b/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
index a6652a1b78..f384d6f9db 100644
--- a/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
+++ b/examples/llm_benchmark/step/step_3.5_flash_te_deepep.yaml
@@ -71,6 +71,7 @@ model:
   config:
     _target_: transformers.AutoConfig.from_pretrained
     pretrained_model_name_or_path: stepfun-ai/Step-3.5-Flash
+    trust_remote_code: true
   trust_remote_code: true
   backend:
     _target_: nemo_automodel.components.models.common.BackendConfig
diff --git a/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml b/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
index 599319e05b..64f93f7f84 100644
--- a/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
+++ b/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
@@ -123,7 +123,7 @@ ci:
   vllm_deploy: true
   vllm_smoke_test: true
   checkpoint_robustness:
-    hf_kl_threshold: 5e-2
+    hf_kl_threshold: 1e-1
     tokenizer_name: openai/gpt-oss-20b
     check_phantom_keys: true
     no_check_resume: true
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
index 9fe160460b..d24fb31495 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
@@ -85,7 +85,7 @@ validation_dataloader:
 
 optimizer:
   _target_: torch.optim.AdamW
-  lr: 1.0e-3
+  lr: 1.0e-4
   weight_decay: 0.01
   betas: [0.9, 0.95]
   eps: 1e-8
@@ -94,7 +94,7 @@ ci:
   recipe_owner: hemildesai
   time: "00:15:00"
   checkpoint_robustness:
-    hf_kl_threshold: 1e-4
+    hf_kl_threshold: 1e-3
     tokenizer_name: Qwen/Qwen3-30B-A3B
     no_check_resume: true
     dataset.num_samples_limit: 500
diff --git a/nemo_automodel/components/models/qwen3_next/model.py b/nemo_automodel/components/models/qwen3_next/model.py
index 6501f024b5..7c7df49a24 100644
--- a/nemo_automodel/components/models/qwen3_next/model.py
+++ b/nemo_automodel/components/models/qwen3_next/model.py
@@ -77,10 +77,6 @@ def forward(
             attn_out = self.linear_attn(
                 hidden_states=self.input_layernorm(x),
                 attention_mask=attention_mask,
-                position_ids=position_ids,
-                qkv_format=attn_kwargs.get("qkv_format"),
-                cu_seqlens=attn_kwargs.get("cu_seqlens"),
-                seq_index=attn_kwargs.get("seq_index"),
             )
         elif self.layer_type == "full_attention":
             attn_out = self.self_attn(