From f6a4f999eb4652fa4c527770b6d65d6f03ca15f9 Mon Sep 17 00:00:00 2001
From: HuiyingLi
Date: Thu, 16 Apr 2026 11:52:56 -0700
Subject: [PATCH] feat: add Qwen3.6-35B-A3B VLM finetune recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a ready-to-run MedPix-VQA fine-tuning recipe for `Qwen/Qwen3.6-35B-A3B`
under the existing `qwen3_5_moe` architecture (same custom model impl).

Verified on 8×H100: 100 steps complete, loss 1.86 → ~1.5, peak mem 64 GiB/GPU.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: HuiyingLi
---
 README.md                                          |   1 +
 docs/model-coverage/latest-models.md               |   1 +
 docs/model-coverage/vlm/qwen/qwen3-5-vl.md         |   2 +
 .../vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml      | 120 ++++++++++++++++++
 4 files changed, 124 insertions(+)
 create mode 100644 examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml

diff --git a/README.md b/README.md
index 15eb3794f4..563acec0b0 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@
 
 ## 📣 News and Discussions
 
+- [04/16/2026][**Qwen3.6 MoE**](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) We now support finetuning `Qwen/Qwen3.6-35B-A3B`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml).
 - [04/12/2026][**MiniMax-M2.7**](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) We now support finetuning `MiniMaxAI/MiniMax-M2.7`. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml).
 - [04/07/2026][**GLM-5.1**](https://huggingface.co/zai-org/GLM-5.1) We now support finetuning `zai-org/GLM-5.1`. GLM-5.1 is Zhipu AI's latest open-source MoE model featuring MLA + DeepSeek Sparse Attention. Check out our [recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml) and [discussion](https://github.com/NVIDIA-NeMo/Automodel/discussions/1719).
 - [04/02/2026][**Gemma 4**](https://huggingface.co/collections/google/gemma-4) We support fine-tuning for Gemma4 (2B, 4B, 31B, 26BA4B)! Check out our [recipes](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune/gemma4).
diff --git a/docs/model-coverage/latest-models.md b/docs/model-coverage/latest-models.md
index 78782ccb15..7d9efd8196 100644
--- a/docs/model-coverage/latest-models.md
+++ b/docs/model-coverage/latest-models.md
@@ -6,6 +6,7 @@ See the [Model Coverage Overview](overview.md) for release summaries, and the [L
 
 | Date | Model | HF Model ID | Modality | Recipe | Try on Brev |
 |------|-------|-------------|----------|--------|------|
+| 2026-04-16 | Qwen3.6 MoE | [`Qwen/Qwen3.6-35B-A3B`](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) | VLM | [qwen3_6_35b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml) | 🚧 |
 | 2026-04-12 | MiniMax-M2.7 | [`MiniMaxAI/MiniMax-M2.7`](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) | LLM | [minimax_m2.7_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml) | |
 | 2026-04-07 | GLM-5.1 | [`zai-org/GLM-5.1`](https://huggingface.co/zai-org/GLM-5.1) | LLM | [glm_5.1_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml) | 🚧 |
 | 2026-04-02 | Gemma 4 | [`google/gemma-4-4b-it`](https://huggingface.co/google/gemma-4-4b-it) | VLM | [gemma4_4b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma4/gemma4_4b.yaml) | 🚧 |
diff --git a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md b/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
index 8c440047a3..e1f24ba64c 100644
--- a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
+++ b/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
@@ -16,6 +16,7 @@ Qwen3.5-VL is Alibaba Cloud's next-generation vision language model series, incl
 - **Qwen3.5-VL-4B**: 4B dense model
 - **Qwen3.5-VL-9B**: 9B dense model
 - **Qwen3.5-MoE**: large MoE variant (35B+)
+- **Qwen3.6-35B-A3B**: next-generation MoE variant (35B total, 3B active)
 
 ## Architectures
 
@@ -30,6 +31,7 @@
 | {download}`qwen3_5_9b.yaml <../../../../examples/vlm_finetune/qwen3_5/qwen3_5_9b.yaml>` | MedPix-VQA | SFT — Qwen3.5-VL 9B on MedPix |
 | {download}`qwen3_5_moe_medpix.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml>` | MedPix-VQA | SFT — Qwen3.5-MoE on MedPix |
 | {download}`qwen3_5_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_35b.yaml>` | MedPix-VQA | SFT — Qwen3.5 35B on MedPix |
+| {download}`qwen3_6_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml>` | MedPix-VQA | SFT — Qwen3.6 35B-A3B on MedPix |
 
 ## Try with NeMo AutoModel
 
diff --git a/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml b/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml
new file mode 100644
index 0000000000..5767838cc5
--- /dev/null
+++ b/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml
@@ -0,0 +1,120 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
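+
+# Recipe summary:
+#   - Model: Qwen/Qwen3.6-35B-A3B -- MoE VLM, ~35B total / ~3B active parameters;
+#     the vision tower is frozen and only the language model is trained.
+#   - Data: mmoukouba/MedPix-VQA (medical VQA, SFT with masked cross-entropy).
+#   - Parallelism: FSDP2 with expert parallelism (ep_size: 8) on 8 GPUs.
+#   - Verified on 8xH100: 100 steps, loss 1.86 -> ~1.5, peak memory ~64 GiB/GPU.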
+
+# To run this recipe:
+# automodel examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml --nproc-per-node 8
+# Adjust --nproc-per-node to the number of GPUs available on your machine.
+
+recipe: FinetuneRecipeForVLM
+
+step_scheduler:
+  global_batch_size: 16  # with local_batch_size 1 on 8 GPUs, this implies 2 gradient-accumulation steps
+  local_batch_size: 1
+  ckpt_every_steps: 1000
+  val_every_steps: 100
+  num_epochs: 2
+  max_steps: 100  # short verification run; raise for a full fine-tune
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 60
+
+rng:
+  _target_: nemo_automodel.components.training.rng.StatefulRNG
+  seed: 1234
+  ranked: true
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
+  pretrained_model_name_or_path: Qwen/Qwen3.6-35B-A3B
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: sdpa
+    linear: torch
+    rms_norm: torch_fp32
+    rope_fusion: false
+    enable_deepep: true  # DeepEP-backed MoE token dispatch; requires DeepEP to be installed
+    fake_balanced_gate: false
+    enable_hf_state_dict_adapter: true
+
+processor:
+  _target_: transformers.AutoProcessor.from_pretrained
+  pretrained_model_name_or_path: Qwen/Qwen3.6-35B-A3B
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /checkpoints/qwen3_6_35b/
+  model_save_format: safetensors
+  save_consolidated: false  # keep per-rank shards; no single consolidated checkpoint
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  dp_replicate_size: 1
+  ep_size: 8  # shard the MoE experts across all 8 GPUs
+
+  sequence_parallel: false
+
+freeze_config:
+  freeze_vision_tower: true
+  freeze_audio_tower: true
+  freeze_language_model: false
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: train
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 1
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn
+    max_length: 2048
+  drop_last: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: validation
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 1
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn
+    max_length: 2048
+
+optimizer:
+  _target_: torch.optim.AdamW
+  betas: [0.9, 0.95]
+  eps: 1.0e-8
+  lr: 5.0e-6
+  weight_decay: 0.1
+
+ci:
+  recipe_owner: HuiyingLi
+  time: "00:30:00"
+
+# Uncomment and configure for W&B logging
+# wandb:
+#   project:
+#   entity:
+#   name:
+#   save_dir:
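
For reviewers checking the batch arithmetic in step_scheduler: a minimal sketch,
assuming each of the 8 ranks consumes its own micro-batch under FSDP2. The
grad_accum_steps helper is hypothetical and for illustration only; the trainer
presumably derives the equivalent value from these config fields.

    # Hypothetical sketch -- not part of this patch.
    def grad_accum_steps(global_batch_size: int, local_batch_size: int, dp_ranks: int) -> int:
        """Micro-batches each rank accumulates before one optimizer step."""
        per_step = local_batch_size * dp_ranks
        assert global_batch_size % per_step == 0, "global batch must divide evenly"
        return global_batch_size // per_step

    # Values from step_scheduler above, launched with --nproc-per-node 8:
    print(grad_accum_steps(16, 1, 8))  # -> 2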