diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_te_hybridep.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_te_hybridep.yaml
new file mode 100644
index 0000000000..88f9da77e8
--- /dev/null
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_te_hybridep.yaml
@@ -0,0 +1,121 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Qwen3-30B-A3B fine-tuning with HybridEP token dispatch.
+#
+# HybridEP fuses token permutation, communication, and expert-side permutation
+# into a single step, reducing kernel-launch overhead compared to DeepEP.
+#
+# To run this recipe:
+#   automodel examples/llm_finetune/qwen/qwen3_moe_30b_te_hybridep.yaml --nproc-per-node 8
+# Adjust --nproc-per-node to the number of GPUs available on your machine.
+
+recipe: TrainFinetuneRecipeForNextTokenPrediction
+
+step_scheduler:
+  global_batch_size: 32
+  local_batch_size: 4
+  ckpt_every_steps: 500
+  num_epochs: 2
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 10
+
+rng:
+  _target_: nemo_automodel.components.training.rng.StatefulRNG
+  seed: 1111
+  ranked: true
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
+  pretrained_model_name_or_path: Qwen/Qwen3-30B-A3B
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: te
+    linear: te
+    rms_norm: torch_fp32
+    experts: te
+    dispatcher: hybridep
+    fake_balanced_gate: false
+    enable_hf_state_dict_adapter: true
+
+checkpoint:
+  enabled: false
+  checkpoint_dir: checkpoints/
+  model_save_format: torch_save # torch_save or safetensors
+  save_consolidated: false # saves the model in a consolidated safetensors format; requires model_save_format to be safetensors
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  ep_size: 8
+
+  sequence_parallel: false
+  activation_checkpointing: true
+
+  pipeline:
+    pp_schedule: interleaved1f1b
+    pp_microbatch_size: 4
+    round_virtual_stages_to_pp_multiple: down
+    scale_grads_in_schedule: false
+    patch_inner_model: false
+    patch_causal_lm_model: false
+    layers_per_stage: 2
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: train
+  pad_to_max_length: false
+
+packed_sequence:
+  # Set packed_sequence_size > 0 to run with packed sequences
+  packed_sequence_size: 1024
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn: nemo_automodel.components.datasets.utils.packed_sequence_thd_collater
+  shuffle: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: validation
+  pad_to_max_length: false
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn: nemo_automodel.components.datasets.utils.packed_sequence_thd_collater
+
+optimizer:
+  _target_: torch.optim.Adam
+  betas: [0.9, 0.999]
+  eps: 1e-7
+  lr: 1.0e-4
+  weight_decay: 0
+  foreach: false
+  # min_lr: 1.0e-5
+
+# # Uncomment and configure for W&B logging
+# wandb:
+#   project:
+#   entity:
+#   name:
+#   save_dir:
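
Note: the header comment positions HybridEP against DeepEP, and the two recipes should differ only in the dispatcher setting. A minimal sketch of the DeepEP variant of the backend block, assuming "deepep" is the corresponding value accepted by BackendConfig.dispatcher (not shown in this diff; all other fields copied unchanged from the recipe above):

    model:
      backend:
        _target_: nemo_automodel.components.models.common.BackendConfig
        attn: te
        linear: te
        rms_norm: torch_fp32
        experts: te
        dispatcher: deepep   # assumed value; "hybridep" above selects the fused HybridEP dispatch path
        fake_balanced_gate: false
        enable_hf_state_dict_adapter: true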