From a0908e688bbf871157ceb2d01d0c6c378e207fe2 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 13:19:06 -0800 Subject: [PATCH 01/15] Explicit mcore path override to use Megatron-Bridge's pinned submodule commit Signed-off-by: Pablo Garay --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 56e934d9..05a40a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } +megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } From d1b810668949394dc682e338694f8e083510284e Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:04:03 -0800 Subject: [PATCH 02/15] Update Megatron-Bridge submodule to latest main with correct Megatron-LM commit (3cbe5c68) Signed-off-by: Pablo Garay --- 3rdparty/Megatron-Bridge | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 8e21f81a..4e4ce420 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 +Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a From 881edc62ecba52a493b5865d126913bcc7b15d6a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:25:20 -0800 Subject: [PATCH 03/15] Add Mcore WAN pretrain mock test to CI/CD Signed-off-by: Pablo Garay --- .github/workflows/cicd-main.yml | 3 + .../L2_Mcore_Mock_Tests_GPU.sh | 15 +++ .../test_mcore_wan_pretrain.py | 109 ++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh create mode 100644 tests/functional_tests/test_mcore_wan_pretrain.py diff --git 
a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b3f08665..7f792d47 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -92,6 +92,9 @@ jobs: - script: L2_Functional_Tests_GPU runner: linux-amd64-gpu-rtxa6000-latest-2-nemo timeout: 30 + - script: L2_Mcore_Mock_Tests_GPU + runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + timeout: 30 needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh new file mode 100644 index 00000000..871b9e6a --- /dev/null +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v + diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py new file mode 100644 index 00000000..b19836af --- /dev/null +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -0,0 +1,109 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Mcore WAN pretrain mock runs.""" + +import os +import subprocess +import tempfile + +import pytest + + +class TestMcoreWanPretrain: + """Test class for Mcore WAN pretrain functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_wan_pretrain_mock(self, tmp_path): + """ + Functional test for WAN pretrain recipe with mock data. + + This test verifies that the WAN pretrain recipe can run successfully + in mock mode with minimal configuration, ensuring: + 1. The distributed training can start without errors + 2. Model initialization works correctly + 3. Forward/backward passes complete successfully + 4. 
The training loop executes without crashes + """ + # Set up temporary directories for dataset and checkpoints + dataset_path = os.path.join(tmp_path, "mock_dataset") + checkpoint_dir = os.path.join(tmp_path, "checkpoints") + os.makedirs(dataset_path, exist_ok=True) + os.makedirs(checkpoint_dir, exist_ok=True) + + # Build the command for the mock run + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=1", + "examples/megatron/recipes/wan/pretrain_wan.py", + "--training-mode", + "pretrain", + "model.tensor_model_parallel_size=1", + "model.pipeline_model_parallel_size=1", + "model.context_parallel_size=1", + "model.crossattn_emb_size=1536", + "model.hidden_size=1536", + "model.ffn_hidden_size=8960", + "model.num_attention_heads=12", + "model.num_layers=3", + "model.qkv_format=thd", + f"dataset.path={dataset_path}", + f"checkpoint.save={checkpoint_dir}", + f"checkpoint.load={checkpoint_dir}", + "checkpoint.load_optim=false", + "checkpoint.save_interval=200", + "optimizer.lr=5e-6", + "optimizer.min_lr=5e-6", + "train.eval_iters=0", + "scheduler.lr_decay_style=constant", + "scheduler.lr_warmup_iters=0", + "model.seq_length=2048", + "dataset.seq_length=2048", + "train.global_batch_size=2", + "train.micro_batch_size=1", + "dataset.global_batch_size=2", + "dataset.micro_batch_size=1", + "logger.log_interval=1", + "--mock", + ] + + # Run the command with a timeout + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + check=True, + ) + + # Print output for debugging if needed + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + + # Basic verification that the run completed + assert result.returncode == 0, f"Command failed with return code {result.returncode}" + + # Check for common success indicators in output + assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( + "Expected to see iteration progress in output" + ) + + except 
subprocess.TimeoutExpired: + pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") + except subprocess.CalledProcessError as e: + pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") + From e387e66b11d42ddacda4244e4a902c873baf1651 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:30:53 -0800 Subject: [PATCH 04/15] lintfix Signed-off-by: Pablo Garay --- tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 1 - tests/functional_tests/test_mcore_wan_pretrain.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index 871b9e6a..2e99db05 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -12,4 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v - diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index b19836af..780bb253 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -16,7 +16,6 @@ import os import subprocess -import tempfile import pytest @@ -106,4 +105,3 @@ def test_wan_pretrain_mock(self, tmp_path): pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") - From 175b42d27ef7a6170e364c7e00bf7616127206e3 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 17:16:42 -0800 Subject: [PATCH 05/15] Fix slow Docker build from Megatron-LM source Signed-off-by: Pablo Garay --- 
docker/Dockerfile.ci | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 7096b3c6..8de9c016 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages # Copy dependency files and source code (needed for dynamic version resolution) COPY pyproject.toml uv.lock ./ COPY dfm ./dfm -COPY 3rdparty ./3rdparty + +# Copy 3rdparty dependencies with minimal files for metadata resolution +# Copy Automodel +COPY 3rdparty/Automodel ./3rdparty/Automodel + +# Copy Megatron-Bridge +COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/ +COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src + +# Copy minimal Megatron-LM files for metadata (prevents full source build) +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ # Install dependencies in two steps: # 1. 
Install build dependencies first (required for packages with no-build-isolation) From 50f058db97a838908bc2040de6f700f6aa8e9eb2 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 09:22:19 -0600 Subject: [PATCH 06/15] ci: Update gpu runners to use self-hosted-nemo Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7f792d47..81af9136 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -45,6 +45,7 @@ jobs: with: image-name: dfm dockerfile: docker/Dockerfile.ci + runner: self-hosted-nemo secrets: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} @@ -56,7 +57,7 @@ jobs: matrix: include: - script: L0_Unit_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 - script: L0_Unit_Tests_CPU runner: linux-amd64-cpu16 @@ -90,10 +91,10 @@ jobs: matrix: include: - script: L2_Functional_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 - script: L2_Mcore_Mock_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} From 4bbb20c94c8554060e0aed45c23626bd9726d5f8 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 10:45:45 -0600 Subject: [PATCH 07/15] Use uv run in test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 76 +++++++++---------- .../test_mcore_wan_pretrain.py | 2 + 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 81af9136..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} 
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: self-hosted-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: self-hosted-nemo - timeout: 30 + # - script: 
L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 780bb253..6ca8ee34 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -43,6 +43,8 @@ def test_wan_pretrain_mock(self, tmp_path): # Build the command for the mock run cmd = [ + "uv", + "run", "python", "-m", "torch.distributed.run", From b412e4eec0b29fe14d736f25d6d66203f58e7ed4 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 11:24:46 -0600 Subject: [PATCH 08/15] Ensure uv group megatron-bridge is used for test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- tests/functional_tests/test_mcore_wan_pretrain.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 6ca8ee34..68b7000b 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -45,6 +45,8 @@ def test_wan_pretrain_mock(self, tmp_path): cmd = [ "uv", "run", + "--group", + "megatron-bridge", "python", "-m", "torch.distributed.run", From 39df47278fc69e4c5b845a68323e072f4933ac9b Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sat, 15 Nov 2025 17:48:53 +0000 Subject: [PATCH 09/15] Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- 
.github/actions/test-template/action.yml | 2 +- tests/functional_tests/test_mcore_wan_pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 68b7000b..0638f615 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -89,7 +89,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=3000, # 5 minute timeout check=True, ) From d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:13:01 -0600 Subject: [PATCH 10/15] Revert GHA changes Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 76 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 989130cb..81af9136 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - # cicd-unit-tests: - # strategy: - # fail-fast: false - # matrix: - # include: - # - script: L0_Unit_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 - # - script: L0_Unit_Tests_CPU - # runner: linux-amd64-cpu16 - # cpu-only: true - # needs: 
[cicd-container-build] - # runs-on: ${{ matrix.runner }} - # name: ${{ matrix.script }} - # environment: nemo-ci - # steps: - # - name: Checkout - # uses: actions/checkout@v4 - # with: - # submodules: recursive - # - name: main - # uses: ./.github/actions/test-template - # with: - # runner: ${{ runner.name }} - # script: ${{ matrix.script }} - # timeout: ${{ matrix.timeout || 10 }} - # is_unit_test: "true" - # image: dfm - # cpu-only: ${{ matrix.cpu-only || false }} - # has-azure-credentials: "true" - # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + cicd-unit-tests: + strategy: + fail-fast: false + matrix: + include: + - script: L0_Unit_Tests_GPU + runner: self-hosted-nemo + timeout: 30 + - script: L0_Unit_Tests_CPU + runner: linux-amd64-cpu16 + cpu-only: true + needs: [cicd-container-build] + runs-on: ${{ matrix.runner }} + name: ${{ matrix.script }} + environment: nemo-ci + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + - name: main + uses: ./.github/actions/test-template + with: + runner: ${{ runner.name }} + script: ${{ matrix.script }} + timeout: ${{ matrix.timeout || 10 }} + is_unit_test: "true" + image: dfm + cpu-only: ${{ matrix.cpu-only || false }} + has-azure-credentials: "true" + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - # - script: L2_Functional_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 + - script: L2_Functional_Tests_GPU + runner: self-hosted-nemo + timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-container-build] + needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: 
nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - # - cicd-unit-tests + - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest From 1b3184affe5b5e0ab02e8252e53f17a5986bf32f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:15:43 -0600 Subject: [PATCH 11/15] Move uv run group call to L2_Mcore_Mock_Tests_GPU Signed-off-by: Charlie Truong --- tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 2 +- tests/functional_tests/test_mcore_wan_pretrain.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index 2e99db05..b8d237a1 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 0638f615..312a299f 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -43,10 +43,6 @@ def test_wan_pretrain_mock(self, tmp_path): # Build the command for the mock run cmd = [ - "uv", - "run", - "--group", - "megatron-bridge", "python", "-m", "torch.distributed.run", From 3a64d34a28b3787933d675e4a51c261d193e7c7d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:17:33 -0600 Subject: [PATCH 12/15] Set test back to 5 minute timeout Signed-off-by: Charlie Truong --- tests/functional_tests/test_mcore_wan_pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 312a299f..780bb253 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -85,7 +85,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=3000, # 5 minute timeout + timeout=300, # 5 minute timeout check=True, ) From afcae6cef26b08f0ab2f1182ee3458733472643e Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 12:09:16 -0800 Subject: [PATCH 13/15] Megatron fixes (#49) * Enhance DiT and Wan layer specifications - Updated `get_query_key_value_tensors` method in `dit_attention.py` to include an `output_gate` parameter and set `split_qkv` to 
default to `True`. - Modified `WanLayerWithAdaLN` class in `wan_layer_spec.py` to add `rotary_pos_cos_sin` parameter for improved positional encoding handling. * Implement ProcessGroupCollection initialization in DiT and Wan models - Added initialization of `pg_collection` in both `DiTCrossAttentionModel` and `WanModel` to ensure proper handling of process groups. - This change checks if `pg_collection` exists and is not None before assigning it, enhancing the robustness of the models. * Update CONTRIBUTING.md to include detailed setup instructions for development environment and Docker container usage. Added sections for building and running the container, as well as setting the PYTHONPATH for DFM. * Refactor import statements in dit_model.py to streamline dependencies. Removed redundant import of ProcessGroupCollection, enhancing code clarity and maintainability. * Refactor code style in DiT and Wan models - Updated string quotes in `dit_model.py` and `wan_model.py` for consistency, changing from single to double quotes. - Reformatted the `get_query_key_value_tensors` method call in `dit_attention.py` for improved readability by breaking it into multiple lines. 
* Revert M4 changes * Ruff * Ruff * Lint --------- Co-authored-by: Abhinav Garg --- CONTRIBUTING.md | 28 +++++++++++++++++++ .../megatron/model/common/dit_attention.py | 8 ++++-- dfm/src/megatron/model/dit/dit_model.py | 1 - dfm/src/megatron/model/wan/wan_layer_spec.py | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68ab66d4..aed9cf99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,32 @@ # Contributing To NeMo DFM +## 🛠️ Setting Up Your Environment + +Use the instructions below to set up a dev environment and a dev container + +### Building a container +```bash +# We recommend that you get the latest commits for Megatron-Bridge and Automodel +# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands +git submodule update --init --recursive --remote # Get all the 3rd party submodules +cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong +# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty +git checkout <commit_hash> +cd ../../../../ +docker build -f docker/Dockerfile.ci -t dfm:latest . 
+``` + +### Run the container +```bash +docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash +``` + +### Inside the container +```bash +# Add DFM to PYTHONPATH +export PYTHONPATH=$PYTHONPATH:/opt/DFM + +# Run a Mock Run: +``` ## Signing Your Work diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py index 321e9b08..acf39d47 100644 --- a/dfm/src/megatron/model/common/dit_attention.py +++ b/dfm/src/megatron/model/common/dit_attention.py @@ -100,7 +100,7 @@ def __init__( else: self.k_layernorm = None - def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True): """ Derives `query`, `key` and `value` tensors from `hidden_states`. """ @@ -251,13 +251,15 @@ def __init__( is_expert=False, ) - def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True): """ Derives `query` tensor from `hidden_states`, and `key`/`value` tensors from `key_value_states`. 
""" - query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states) + query, key, value = super().get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) # gather query and key heads across TP ranks if self.layernorm_across_heads is True if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1: diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py index e3ae8a29..38cb8422 100644 --- a/dfm/src/megatron/model/dit/dit_model.py +++ b/dfm/src/megatron/model/dit/dit_model.py @@ -105,7 +105,6 @@ def __init__( super(DiTCrossAttentionModel, self).__init__(config=config) self.config: TransformerConfig = config - self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() self.pre_process = pre_process self.post_process = post_process diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py index 2b355930..a0d6354e 100644 --- a/dfm/src/megatron/model/wan/wan_layer_spec.py +++ b/dfm/src/megatron/model/wan/wan_layer_spec.py @@ -162,6 +162,7 @@ def forward( packed_seq_params=None, sequence_len_offset=None, inference_context=None, + rotary_pos_cos_sin=None, ): # the timestep embedding is stored in attention_mask argument timestep_emb = attention_mask From b87b7dc74d6ac7f37dbe4d28ae19bca533ef21a6 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 12:54:43 -0800 Subject: [PATCH 14/15] Revert "Revert GHA changes" This reverts commit d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3. 
--- .github/workflows/cicd-main.yml | 76 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 81af9136..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: self-hosted-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ 
matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: self-hosted-nemo - timeout: 30 + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest From 0bc8872724aa8c8041d32eae2cf1d7217d96cda3 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 13:23:59 -0800 Subject: [PATCH 15/15] tempfortest: timeout setting Signed-off-by: Pablo Garay --- tests/functional_tests/test_mcore_wan_pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 780bb253..1d8122ae 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path): "optimizer.lr=5e-6", "optimizer.min_lr=5e-6", "train.eval_iters=0", + "train.max_steps=10", "scheduler.lr_decay_style=constant", "scheduler.lr_warmup_iters=0", "model.seq_length=2048", @@ -85,7 +86,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=1800, # 30 minute timeout check=True, ) @@ -102,6 +103,6 @@ def test_wan_pretrain_mock(self, tmp_path): ) except subprocess.TimeoutExpired: - pytest.fail("WAN 
pretrain mock run exceeded timeout of 300 seconds") + pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")