From f6a4f999eb4652fa4c527770b6d65d6f03ca15f9 Mon Sep 17 00:00:00 2001
From: HuiyingLi
Date: Thu, 16 Apr 2026 11:52:56 -0700
Subject: [PATCH] feat: add Qwen3.6-35B-A3B VLM finetune recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a ready-to-run MedPix-VQA fine-tuning recipe for `Qwen/Qwen3.6-35B-A3B`
under the existing `qwen3_5_moe` architecture (same custom model impl).

Verified on 8×H100: 100 steps complete, loss 1.86 → ~1.5, peak mem 64 GiB/GPU.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: HuiyingLi
---
 README.md                                          |   1 +
 docs/model-coverage/latest-models.md               |   1 +
 docs/model-coverage/vlm/qwen/qwen3-5-vl.md         |   2 +
 .../vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml      | 120 ++++++++++++++++++
 4 files changed, 124 insertions(+)
 create mode 100644 examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml

diff --git a/README.md b/README.md
index 15eb3794f4..563acec0b0 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@
 
 ## 📣 News and Discussions
 
+- [04/16/2026][**Qwen3.6 MoE**](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) We now support finetuning `Qwen/Qwen3.6-35B-A3B`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml).
 - [04/12/2026][**MiniMax-M2.7**](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) We now support finetuning `MiniMaxAI/MiniMax-M2.7`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml).
 - [04/07/2026][**GLM-5.1**](https://huggingface.co/zai-org/GLM-5.1) We now support finetuning `zai-org/GLM-5.1`. GLM-5.1 is Zhipu AI's latest open-source MoE model featuring MLA + DeepSeek Sparse Attention. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml) and [discussion](https://github.com/NVIDIA-NeMo/Automodel/discussions/1719).
 - [04/02/2026][**Gemma 4**](https://huggingface.co/collections/google/gemma-4) We support fine-tuning for Gemma4 (2B, 4B, 31B, 26BA4B)! Check out our [recipes](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune/gemma4).
diff --git a/docs/model-coverage/latest-models.md b/docs/model-coverage/latest-models.md
index 78782ccb15..7d9efd8196 100644
--- a/docs/model-coverage/latest-models.md
+++ b/docs/model-coverage/latest-models.md
@@ -6,6 +6,7 @@ See the [Model Coverage Overview](overview.md) for release summaries, and the [L
 
 | Date | Model | HF Model ID | Modality | Recipe | Try on Brev |
 |------|-------|-------------|----------|--------|------|
+| 2026-04-16 | Qwen3.6 MoE | [`Qwen/Qwen3.6-35B-A3B`](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) | VLM | [qwen3_6_35b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml) | 🚧 |
 | 2026-04-12 | MiniMax-M2.7 | [`MiniMaxAI/MiniMax-M2.7`](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) | LLM | [minimax_m2.7_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml) | |
 | 2026-04-07 | GLM-5.1 | [`zai-org/GLM-5.1`](https://huggingface.co/zai-org/GLM-5.1) | LLM | [glm_5.1_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml) | 🚧 |
 | 2026-04-02 | Gemma 4 | [`google/gemma-4-4b-it`](https://huggingface.co/google/gemma-4-4b-it) | VLM | [gemma4_4b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma4/gemma4_4b.yaml) | 🚧 |
diff --git a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md b/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
index 8c440047a3..e1f24ba64c 100644
--- a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
+++ b/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
@@ -16,6 +16,7 @@ Qwen3.5-VL is Alibaba Cloud's next-generation vision language model series, incl
 - **Qwen3.5-VL-4B**: 4B dense model
 - **Qwen3.5-VL-9B**: 9B dense model
 - **Qwen3.5-MoE**: large MoE variant (35B+)
+- **Qwen3.6-35B-A3B**: next-generation MoE variant (35B total, 3B active)
 
 ## Architectures
 
@@ -30,6 +31,7 @@
 | {download}`qwen3_5_9b.yaml <../../../../examples/vlm_finetune/qwen3_5/qwen3_5_9b.yaml>` | MedPix-VQA | SFT — Qwen3.5-VL 9B on MedPix |
 | {download}`qwen3_5_moe_medpix.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml>` | MedPix-VQA | SFT — Qwen3.5-MoE on MedPix |
 | {download}`qwen3_5_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_35b.yaml>` | MedPix-VQA | SFT — Qwen3.5 35B on MedPix |
+| {download}`qwen3_6_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml>` | MedPix-VQA | SFT — Qwen3.6 35B-A3B on MedPix |
 
 ## Try with NeMo AutoModel
 
diff --git a/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml b/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml
new file mode 100644
index 0000000000..5767838cc5
--- /dev/null
+++ b/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml
@@ -0,0 +1,120 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
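+
+# Recipe summary:
+#   - Model: Qwen/Qwen3.6-35B-A3B -- MoE VLM, ~35B total / ~3B active parameters;
+#     the vision tower is frozen and only the language model is trained.
+#   - Data: mmoukouba/MedPix-VQA (medical VQA, SFT with masked cross-entropy).
+#   - Parallelism: FSDP2 with expert parallelism (ep_size: 8) on 8 GPUs.
+#   - Verified on 8xH100: 100 steps, loss 1.86 -> ~1.5, peak memory ~64 GiB/GPU.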
+
+# To run this recipe:
+# automodel examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml --nproc-per-node 8
+# Adjust --nproc-per-node to the number of GPUs available on your machine.
+
+recipe: FinetuneRecipeForVLM
+
+step_scheduler:
+  global_batch_size: 16  # with local_batch_size 1 on 8 GPUs, this implies 2 gradient-accumulation steps
+  local_batch_size: 1
+  ckpt_every_steps: 1000
+  val_every_steps: 100
+  num_epochs: 2
+  max_steps: 100  # short verification run; raise for a full fine-tune
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 60
+
+rng:
+  _target_: nemo_automodel.components.training.rng.StatefulRNG
+  seed: 1234
+  ranked: true
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
+  pretrained_model_name_or_path: Qwen/Qwen3.6-35B-A3B
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: sdpa
+    linear: torch
+    rms_norm: torch_fp32
+    rope_fusion: false
+    enable_deepep: true  # DeepEP-backed MoE token dispatch; requires DeepEP to be installed
+    fake_balanced_gate: false
+    enable_hf_state_dict_adapter: true
+
+processor:
+  _target_: transformers.AutoProcessor.from_pretrained
+  pretrained_model_name_or_path: Qwen/Qwen3.6-35B-A3B
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /checkpoints/qwen3_6_35b/
+  model_save_format: safetensors
+  save_consolidated: false  # keep per-rank shards; no single consolidated checkpoint
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  dp_replicate_size: 1
+  ep_size: 8  # shard the MoE experts across all 8 GPUs
+
+  sequence_parallel: false
+
+freeze_config:
+  freeze_vision_tower: true
+  freeze_audio_tower: true
+  freeze_language_model: false
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: train
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 1
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn
+    max_length: 2048
+  drop_last: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: validation
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 1
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn
+    max_length: 2048
+
+optimizer:
+  _target_: torch.optim.AdamW
+  betas: [0.9, 0.95]
+  eps: 1.0e-8
+  lr: 5.0e-6
+  weight_decay: 0.1
+
+ci:
+  recipe_owner: HuiyingLi
+  time: "00:30:00"
+
+# Uncomment and configure for W&B logging
+# wandb:
+#   project:
+#   entity:
+#   name:
+#   save_dir:
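
For reviewers checking the batch arithmetic in step_scheduler: a minimal sketch,
assuming each of the 8 ranks consumes its own micro-batch under FSDP2. The
grad_accum_steps helper is hypothetical and for illustration only; the trainer
presumably derives the equivalent value from these config fields.

    # Hypothetical sketch -- not part of this patch.
    def grad_accum_steps(global_batch_size: int, local_batch_size: int, dp_ranks: int) -> int:
        """Micro-batches each rank accumulates before one optimizer step."""
        per_step = local_batch_size * dp_ranks
        assert global_batch_size % per_step == 0, "global batch must divide evenly"
        return global_batch_size // per_step

    # Values from step_scheduler above, launched with --nproc-per-node 8:
    print(grad_accum_steps(16, 1, 8))  # -> 2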