Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ runs:
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=1 \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--env RUN_ID=${{ github.run_id }} \
Expand Down
78 changes: 41 additions & 37 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,54 +45,58 @@ jobs:
with:
image-name: dfm
dockerfile: docker/Dockerfile.ci
runner: self-hosted-nemo
secrets:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-unit-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L0_Unit_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
timeout: 30
- script: L0_Unit_Tests_CPU
runner: linux-amd64-cpu16
cpu-only: true
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.script }}
environment: nemo-ci
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
- name: main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
timeout: ${{ matrix.timeout || 10 }}
is_unit_test: "true"
image: dfm
cpu-only: ${{ matrix.cpu-only || false }}
has-azure-credentials: "true"
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# cicd-unit-tests:
# strategy:
# fail-fast: false
# matrix:
# include:
# - script: L0_Unit_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
# - script: L0_Unit_Tests_CPU
# runner: linux-amd64-cpu16
# cpu-only: true
# needs: [cicd-container-build]
# runs-on: ${{ matrix.runner }}
# name: ${{ matrix.script }}
# environment: nemo-ci
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: main
# uses: ./.github/actions/test-template
# with:
# runner: ${{ runner.name }}
# script: ${{ matrix.script }}
# timeout: ${{ matrix.timeout || 10 }}
# is_unit_test: "true"
# image: dfm
# cpu-only: ${{ matrix.cpu-only || false }}
# has-azure-credentials: "true"
# azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
# azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
# azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-e2e-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L2_Functional_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
# - script: L2_Functional_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
- script: L2_Mcore_Mock_Tests_GPU
runner: self-hosted-nemo
timeout: 30
needs: [cicd-unit-tests]
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
environment: nemo-ci
Expand All @@ -117,7 +121,7 @@ jobs:
Nemo_CICD_Test:
needs:
- cicd-container-build
- cicd-unit-tests
# - cicd-unit-tests
- cicd-e2e-tests
if: always()
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge
Submodule Megatron-Bridge updated 77 files
+5 −1 .github/workflows/cache-hf-model.yml
+2 −1 .github/workflows/cicd-main.yml
+1 −1 3rdparty/Megatron-LM
+1 −1 CONTRIBUTING.md
+1 −0 docs/models/llm/index.md
+183 −0 docs/models/llm/olmoe.md
+1 −0 docs/models/vlm/index.md
+143 −0 docs/models/vlm/qwen2.5-vl.md
+7 −0 docs/training/checkpointing.md
+0 −1 docs/training/peft.md
+674 −0 examples/conversion/compare_text_generation.py
+1 −1 examples/quantization/ptq_generate.py
+48 −0 examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml
+147 −0 examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py
+9 −2 examples/recipes/qwen_vl/finetune_qwen25_vl.py
+16 −31 scripts/performance/argument_parser.py
+22 −12 scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
+8 −8 scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py
+38 −22 scripts/performance/configs/llama3/llama3_llm_pretrain.py
+2 −5 scripts/performance/configs/llama3/workload_base_configs.py
+27 −11 scripts/performance/configs/llama31/llama31_llm_pretrain.py
+6 −6 scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py
+24 −23 scripts/performance/configs/qwen3/qwen3_llm_pretrain.py
+12 −25 scripts/performance/perf_plugins.py
+11 −8 scripts/performance/setup_experiment.py
+9 −6 scripts/performance/utils/executors.py
+34 −25 scripts/performance/utils/helpers.py
+2 −6 scripts/performance/utils/utils.py
+105 −0 src/megatron/bridge/data/iterator_utils.py
+1 −1 src/megatron/bridge/data/loaders.py
+39 −1 src/megatron/bridge/models/conversion/auto_bridge.py
+112 −4 src/megatron/bridge/models/conversion/model_bridge.py
+8 −8 src/megatron/bridge/models/conversion/param_mapping.py
+2 −2 src/megatron/bridge/models/conversion/utils.py
+1 −0 src/megatron/bridge/models/gemma/gemma3_provider.py
+3 −0 src/megatron/bridge/models/gpt_provider.py
+2 −0 src/megatron/bridge/models/mamba/mamba_provider.py
+1 −1 src/megatron/bridge/models/olmoe/olmoe_provider.py
+31 −7 src/megatron/bridge/peft/lora.py
+2 −0 src/megatron/bridge/recipes/gemma/__init__.py
+245 −1 src/megatron/bridge/recipes/gemma/gemma3.py
+5 −5 src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
+22 −0 src/megatron/bridge/recipes/llama/__init__.py
+534 −22 src/megatron/bridge/recipes/llama/llama3.py
+24 −0 src/megatron/bridge/recipes/olmoe/__init__.py
+682 −0 src/megatron/bridge/recipes/olmoe/olmoe_7b.py
+6 −0 src/megatron/bridge/recipes/qwen/__init__.py
+294 −1 src/megatron/bridge/recipes/qwen/qwen3_moe.py
+292 −49 src/megatron/bridge/recipes/qwen/qwen3_next.py
+73 −9 src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py
+37 −21 src/megatron/bridge/training/checkpointing.py
+22 −1 src/megatron/bridge/training/config.py
+44 −4 src/megatron/bridge/training/eval.py
+0 −1 src/megatron/bridge/training/pretrain.py
+6 −0 src/megatron/bridge/training/setup.py
+125 −68 src/megatron/bridge/training/train.py
+154 −38 src/megatron/bridge/training/utils/checkpoint_utils.py
+184 −121 tests/end_to_end_tests/evaluate_recipe_training.py
+28 −0 tests/functional_tests/L2_Launch_post_training_quantization.sh
+1 −0 tests/functional_tests/L2_Launch_quantization_aware_training.sh
+81 −10 tests/functional_tests/quantization/test_qat_workflow.py
+27 −10 tests/functional_tests/quantization/test_quantization_workflow.py
+1 −0 tests/functional_tests/training/test_finetune_lora.py
+2 −0 tests/functional_tests/training/test_pretrain_resume.py
+1 −1 tests/functional_tests/utils.py
+143 −0 tests/unit_tests/data/test_finetuning.py
+109 −0 tests/unit_tests/data/test_iterator_utils.py
+40 −0 tests/unit_tests/models/test_auto_bridge.py
+119 −0 tests/unit_tests/models/test_model_bridge_lora.py
+262 −14 tests/unit_tests/recipes/test_gemma3_recipes.py
+356 −24 tests/unit_tests/recipes/test_llama_recipes.py
+457 −0 tests/unit_tests/recipes/test_olmoe_recipes.py
+216 −17 tests/unit_tests/recipes/test_qwen_recipes.py
+26 −7 tests/unit_tests/training/test_checkpointing.py
+233 −0 tests/unit_tests/training/test_train.py
+46 −0 tests/unit_tests/training/utils/test_checkpoint_utils.py
+104 −164 uv.lock
28 changes: 28 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,32 @@
# Contributing To NeMo DFM
## 🛠️ Setting Up Your Environment

Use the instructions below to set up a dev environment and a dev container.

### Building a container
```bash
# We recommend getting the latest commits for Megatron-Bridge and Automodel.
# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands.
git submodule update --init --recursive --remote # Get all the 3rd party submodules
cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
git checkout <commit_hash>
cd ../../../../
docker build -f docker/Dockerfile.ci -t dfm:latest .
```

### Run the container
```bash
docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
```

### Inside the container
```bash
# Add DFM to PYTHONPATH
export PYTHONPATH=$PYTHONPATH:/opt/DFM

# Run a Mock Run:
```

## Signing Your Work

Expand Down
8 changes: 5 additions & 3 deletions dfm/src/megatron/model/common/dit_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
else:
self.k_layernorm = None

def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
"""
Derives `query`, `key` and `value` tensors from `hidden_states`.
"""
Expand Down Expand Up @@ -251,13 +251,15 @@ def __init__(
is_expert=False,
)

def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
"""
Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
from `key_value_states`.
"""

query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states)
query, key, value = super().get_query_key_value_tensors(
hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv
)

# gather query and key heads across TP ranks if self.layernorm_across_heads is True
if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1:
Expand Down
1 change: 0 additions & 1 deletion dfm/src/megatron/model/dit/dit_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def __init__(
super(DiTCrossAttentionModel, self).__init__(config=config)

self.config: TransformerConfig = config

self.transformer_decoder_layer_spec = transformer_decoder_layer_spec()
self.pre_process = pre_process
self.post_process = post_process
Expand Down
1 change: 1 addition & 0 deletions dfm/src/megatron/model/wan/wan_layer_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def forward(
packed_seq_params=None,
sequence_len_offset=None,
inference_context=None,
rotary_pos_cos_sin=None,
):
# the timestep embedding is stored in attention_mask argument
timestep_emb = attention_mask
Expand Down
14 changes: 13 additions & 1 deletion docker/Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
# Copy dependency files and source code (needed for dynamic version resolution)
COPY pyproject.toml uv.lock ./
COPY dfm ./dfm
COPY 3rdparty ./3rdparty

# Copy 3rdparty dependencies with minimal files for metadata resolution
# Copy Automodel
COPY 3rdparty/Automodel ./3rdparty/Automodel

# Copy Megatron-Bridge
COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src

# Copy minimal Megatron-LM files for metadata (prevents full source build)
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/

# Install dependencies in two steps:
# 1. Install build dependencies first (required for packages with no-build-isolation)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ explicit = true
[tool.uv.sources]
nemo-automodel = { path = "3rdparty/Automodel" }
megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
nvidia-resiliency-ext = { index = "pypi" }

Expand Down
14 changes: 14 additions & 0 deletions tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CI entry point for the L2 Mcore mock tests: runs the WAN pretrain mock test
# under coverage using the `megatron-bridge` uv dependency group.
#   - CUDA_VISIBLE_DEVICES="0,1": restricts the run to the first two GPUs.
#   - coverage run -a: appends results to the shared data file at /opt/DFM/.coverage.
#   - -m "not pleasefixme": skips tests marked as known-broken.
#   - --with_downloads: presumably enables test-asset downloads — defined by the
#     project's pytest conftest (not visible here).
CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
108 changes: 108 additions & 0 deletions tests/functional_tests/test_mcore_wan_pretrain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functional smoke tests for Mcore WAN pretrain mock runs."""

import os
import subprocess

import pytest


class TestMcoreWanPretrain:
    """Functional smoke tests for the Mcore WAN pretrain recipe (mock mode)."""

    @pytest.mark.run_only_on("GPU")
    def test_wan_pretrain_mock(self, tmp_path):
        """
        Functional test for the WAN pretrain recipe with mock data.

        Launches ``examples/megatron/recipes/wan/pretrain_wan.py`` via
        ``torch.distributed.run`` on a single process with a tiny model
        configuration, ensuring:

        1. The distributed training can start without errors
        2. Model initialization works correctly
        3. Forward/backward passes complete successfully
        4. The training loop executes without crashes

        Args:
            tmp_path: pytest-provided per-test temporary directory
                (``pathlib.Path``) used for the mock dataset and checkpoints.
        """
        # Per-test scratch directories so concurrent/repeated runs never
        # share dataset or checkpoint state.
        dataset_path = os.path.join(tmp_path, "mock_dataset")
        checkpoint_dir = os.path.join(tmp_path, "checkpoints")
        os.makedirs(dataset_path, exist_ok=True)
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Minimal single-process mock configuration: no tensor/pipeline/context
        # parallelism, a 3-layer model, and only 10 training steps.
        cmd = [
            "python",
            "-m",
            "torch.distributed.run",
            "--nproc_per_node=1",
            "examples/megatron/recipes/wan/pretrain_wan.py",
            "--training-mode",
            "pretrain",
            "model.tensor_model_parallel_size=1",
            "model.pipeline_model_parallel_size=1",
            "model.context_parallel_size=1",
            "model.crossattn_emb_size=1536",
            "model.hidden_size=1536",
            "model.ffn_hidden_size=8960",
            "model.num_attention_heads=12",
            "model.num_layers=3",
            "model.qkv_format=thd",
            f"dataset.path={dataset_path}",
            f"checkpoint.save={checkpoint_dir}",
            f"checkpoint.load={checkpoint_dir}",
            "checkpoint.load_optim=false",
            "checkpoint.save_interval=200",
            "optimizer.lr=5e-6",
            "optimizer.min_lr=5e-6",
            "train.eval_iters=0",
            "train.max_steps=10",
            "scheduler.lr_decay_style=constant",
            "scheduler.lr_warmup_iters=0",
            "model.seq_length=2048",
            "dataset.seq_length=2048",
            "train.global_batch_size=2",
            "train.micro_batch_size=1",
            "dataset.global_batch_size=2",
            "dataset.micro_batch_size=1",
            "logger.log_interval=1",
            "--mock",
        ]

        # Only the subprocess launch lives in the try block, so the except
        # clauses can never mask an assertion failure from the checks below.
        try:
            # check=True raises CalledProcessError on any nonzero exit code,
            # so reaching the code after this call already implies success —
            # no separate returncode assertion is needed.
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=1800,  # 30 minute hard cap for the whole run
                check=True,
            )
        except subprocess.TimeoutExpired:
            pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
        except subprocess.CalledProcessError as e:
            pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")

        # Surface subprocess output in the pytest log for debugging.
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)

        # Sanity check: training progress should have been logged. Search both
        # streams, since where the recipe logs iterations is not fixed here.
        combined_output = (result.stdout + result.stderr).lower()
        assert "iteration" in combined_output, "Expected to see iteration progress in output"