From a0908e688bbf871157ceb2d01d0c6c378e207fe2 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 13:19:06 -0800
Subject: [PATCH 01/15] Explicit mcore path override to use Megatron-Bridge's
 pinned submodule commit

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 56e934d9..05a40a68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -132,6 +132,7 @@ explicit = true
 [tool.uv.sources]
 nemo-automodel = { path = "3rdparty/Automodel" }
 megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
+megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
 nvidia-resiliency-ext = { index = "pypi" }
 

From d1b810668949394dc682e338694f8e083510284e Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:04:03 -0800
Subject: [PATCH 02/15] Update Megatron-Bridge submodule to latest main with
 correct Megatron-LM commit (3cbe5c68)

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 3rdparty/Megatron-Bridge | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge
index 8e21f81a..4e4ce420 160000
--- a/3rdparty/Megatron-Bridge
+++ b/3rdparty/Megatron-Bridge
@@ -1 +1 @@
-Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9
+Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a

From 881edc62ecba52a493b5865d126913bcc7b15d6a Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:25:20 -0800
Subject: [PATCH 03/15] Add Mcore WAN pretrain mock test to CI/CD

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 .github/workflows/cicd-main.yml               |   3 +
 .../L2_Mcore_Mock_Tests_GPU.sh                |  15 +++
 .../test_mcore_wan_pretrain.py                | 109 ++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
 create mode 100644 tests/functional_tests/test_mcore_wan_pretrain.py

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index b3f08665..7f792d47 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -92,6 +92,9 @@ jobs:
           - script: L2_Functional_Tests_GPU
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
             timeout: 30
+          - script: L2_Mcore_Mock_Tests_GPU
+            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            timeout: 30
     needs: [cicd-unit-tests]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
new file mode 100644
index 00000000..871b9e6a
--- /dev/null
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -0,0 +1,15 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
+
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
new file mode 100644
index 00000000..b19836af
--- /dev/null
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for Mcore WAN pretrain mock runs."""
+
+import os
+import subprocess
+import tempfile
+
+import pytest
+
+
+class TestMcoreWanPretrain:
+    """Test class for Mcore WAN pretrain functional tests."""
+
+    @pytest.mark.run_only_on("GPU")
+    def test_wan_pretrain_mock(self, tmp_path):
+        """
+        Functional test for WAN pretrain recipe with mock data.
+
+        This test verifies that the WAN pretrain recipe can run successfully
+        in mock mode with minimal configuration, ensuring:
+        1. The distributed training can start without errors
+        2. Model initialization works correctly
+        3. Forward/backward passes complete successfully
+        4. The training loop executes without crashes
+        """
+        # Set up temporary directories for dataset and checkpoints
+        dataset_path = os.path.join(tmp_path, "mock_dataset")
+        checkpoint_dir = os.path.join(tmp_path, "checkpoints")
+        os.makedirs(dataset_path, exist_ok=True)
+        os.makedirs(checkpoint_dir, exist_ok=True)
+
+        # Build the command for the mock run
+        cmd = [
+            "python",
+            "-m",
+            "torch.distributed.run",
+            "--nproc_per_node=1",
+            "examples/megatron/recipes/wan/pretrain_wan.py",
+            "--training-mode",
+            "pretrain",
+            "model.tensor_model_parallel_size=1",
+            "model.pipeline_model_parallel_size=1",
+            "model.context_parallel_size=1",
+            "model.crossattn_emb_size=1536",
+            "model.hidden_size=1536",
+            "model.ffn_hidden_size=8960",
+            "model.num_attention_heads=12",
+            "model.num_layers=3",
+            "model.qkv_format=thd",
+            f"dataset.path={dataset_path}",
+            f"checkpoint.save={checkpoint_dir}",
+            f"checkpoint.load={checkpoint_dir}",
+            "checkpoint.load_optim=false",
+            "checkpoint.save_interval=200",
+            "optimizer.lr=5e-6",
+            "optimizer.min_lr=5e-6",
+            "train.eval_iters=0",
+            "scheduler.lr_decay_style=constant",
+            "scheduler.lr_warmup_iters=0",
+            "model.seq_length=2048",
+            "dataset.seq_length=2048",
+            "train.global_batch_size=2",
+            "train.micro_batch_size=1",
+            "dataset.global_batch_size=2",
+            "dataset.micro_batch_size=1",
+            "logger.log_interval=1",
+            "--mock",
+        ]
+
+        # Run the command with a timeout
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=300,  # 5 minute timeout
+                check=True,
+            )
+
+            # Print output for debugging if needed
+            print("STDOUT:", result.stdout)
+            print("STDERR:", result.stderr)
+
+            # Basic verification that the run completed
+            assert result.returncode == 0, f"Command failed with return code {result.returncode}"
+
+            # Check for common success indicators in output
+            assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), (
+                "Expected to see iteration progress in output"
+            )
+
+        except subprocess.TimeoutExpired:
+            pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
+        except subprocess.CalledProcessError as e:
+            pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
+

From e387e66b11d42ddacda4244e4a902c873baf1651 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:30:53 -0800
Subject: [PATCH 04/15] lintfix

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 1 -
 tests/functional_tests/test_mcore_wan_pretrain.py | 2 --
 2 files changed, 3 deletions(-)

diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
index 871b9e6a..2e99db05 100644
--- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -12,4 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
-
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index b19836af..780bb253 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -16,7 +16,6 @@
 
 import os
 import subprocess
-import tempfile
 
 import pytest
 
@@ -106,4 +105,3 @@ def test_wan_pretrain_mock(self, tmp_path):
             pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
         except subprocess.CalledProcessError as e:
             pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
-

From 175b42d27ef7a6170e364c7e00bf7616127206e3 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 17:16:42 -0800
Subject: [PATCH 05/15] Fix slow Docker build from Megatron-LM source

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 docker/Dockerfile.ci | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 7096b3c6..8de9c016 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
 # Copy dependency files and source code (needed for dynamic version resolution)
 COPY pyproject.toml uv.lock ./
 COPY dfm ./dfm
-COPY 3rdparty ./3rdparty
+
+# Copy 3rdparty dependencies with minimal files for metadata resolution
+# Copy Automodel
+COPY 3rdparty/Automodel ./3rdparty/Automodel
+
+# Copy Megatron-Bridge
+COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
+COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src
+
+# Copy minimal Megatron-LM files for metadata (prevents full source build)
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
 
 # Install dependencies in two steps:
 # 1. Install build dependencies first (required for packages with no-build-isolation)

From 50f058db97a838908bc2040de6f700f6aa8e9eb2 Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 09:22:19 -0600
Subject: [PATCH 06/15] ci: Update gpu runners to use self-hosted-nemo

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 .github/workflows/cicd-main.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 7f792d47..81af9136 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -45,6 +45,7 @@ jobs:
     with:
       image-name: dfm
       dockerfile: docker/Dockerfile.ci
+      runner: self-hosted-nemo
     secrets:
       AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
@@ -56,7 +57,7 @@ jobs:
       matrix:
         include:
           - script: L0_Unit_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            runner: self-hosted-nemo
             timeout: 30
           - script: L0_Unit_Tests_CPU
             runner: linux-amd64-cpu16
@@ -90,10 +91,10 @@ jobs:
       matrix:
         include:
           - script: L2_Functional_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            runner: self-hosted-nemo
             timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            runner: self-hosted-nemo
             timeout: 30
     needs: [cicd-unit-tests]
     runs-on: ${{ matrix.runner }}

From 4bbb20c94c8554060e0aed45c23626bd9726d5f8 Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 10:45:45 -0600
Subject: [PATCH 07/15] Use uv run in test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 .github/workflows/cicd-main.yml               | 76 +++++++++----------
 .../test_mcore_wan_pretrain.py                |  2 +
 2 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 81af9136..989130cb 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -51,52 +51,52 @@ jobs:
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-  cicd-unit-tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - script: L0_Unit_Tests_GPU
-            runner: self-hosted-nemo
-            timeout: 30
-          - script: L0_Unit_Tests_CPU
-            runner: linux-amd64-cpu16
-            cpu-only: true
-    needs: [cicd-container-build]
-    runs-on: ${{ matrix.runner }}
-    name: ${{ matrix.script }}
-    environment: nemo-ci
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: main
-        uses: ./.github/actions/test-template
-        with:
-          runner: ${{ runner.name }}
-          script: ${{ matrix.script }}
-          timeout: ${{ matrix.timeout || 10 }}
-          is_unit_test: "true"
-          image: dfm
-          cpu-only: ${{ matrix.cpu-only || false }}
-          has-azure-credentials: "true"
-          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+  # cicd-unit-tests:
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       include:
+  #         - script: L0_Unit_Tests_GPU
+  #           runner: self-hosted-nemo
+  #           timeout: 30
+  #         - script: L0_Unit_Tests_CPU
+  #           runner: linux-amd64-cpu16
+  #           cpu-only: true
+  #   needs: [cicd-container-build]
+  #   runs-on: ${{ matrix.runner }}
+  #   name: ${{ matrix.script }}
+  #   environment: nemo-ci
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@v4
+  #       with:
+  #         submodules: recursive
+  #     - name: main
+  #       uses: ./.github/actions/test-template
+  #       with:
+  #         runner: ${{ runner.name }}
+  #         script: ${{ matrix.script }}
+  #         timeout: ${{ matrix.timeout || 10 }}
+  #         is_unit_test: "true"
+  #         image: dfm
+  #         cpu-only: ${{ matrix.cpu-only || false }}
+  #         has-azure-credentials: "true"
+  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   cicd-e2e-tests:
     strategy:
       fail-fast: false
       matrix:
         include:
-          - script: L2_Functional_Tests_GPU
-            runner: self-hosted-nemo
-            timeout: 30
+          # - script: L2_Functional_Tests_GPU
+          #   runner: self-hosted-nemo
+          #   timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
             runner: self-hosted-nemo
             timeout: 30
-    needs: [cicd-unit-tests]
+    needs: [cicd-container-build]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -121,7 +121,7 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - cicd-container-build
-      - cicd-unit-tests
+      # - cicd-unit-tests
       - cicd-e2e-tests
     if: always()
     runs-on: ubuntu-latest
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 780bb253..6ca8ee34 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -43,6 +43,8 @@ def test_wan_pretrain_mock(self, tmp_path):
 
         # Build the command for the mock run
         cmd = [
+            "uv",
+            "run",
             "python",
             "-m",
             "torch.distributed.run",

From b412e4eec0b29fe14d736f25d6d66203f58e7ed4 Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 11:24:46 -0600
Subject: [PATCH 08/15] Ensure uv group megatron-bridge is used for
 test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 tests/functional_tests/test_mcore_wan_pretrain.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 6ca8ee34..68b7000b 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -45,6 +45,8 @@ def test_wan_pretrain_mock(self, tmp_path):
         cmd = [
             "uv",
             "run",
+            "--group",
+            "megatron-bridge",
             "python",
             "-m",
             "torch.distributed.run",

From 39df47278fc69e4c5b845a68323e072f4933ac9b Mon Sep 17 00:00:00 2001
From: Abhinav Garg <abhinavg@stanford.edu>
Date: Sat, 15 Nov 2025 17:48:53 +0000
Subject: [PATCH 09/15] Update TRANSFORMERS_OFFLINE environment variable to 0
 and increase timeout in test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 .github/actions/test-template/action.yml          | 2 +-
 tests/functional_tests/test_mcore_wan_pretrain.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml
index ccf10a80..ef0e5446 100644
--- a/.github/actions/test-template/action.yml
+++ b/.github/actions/test-template/action.yml
@@ -161,7 +161,7 @@ runs:
           -d \
           --name nemo_container_${{ github.run_id }} ${ARG[@]} \
           --shm-size=64g \
-          --env TRANSFORMERS_OFFLINE=1 \
+          --env TRANSFORMERS_OFFLINE=0 \
           --env HYDRA_FULL_ERROR=1 \
           --env HF_HOME=/home/TestData/HF_HOME \
           --env RUN_ID=${{ github.run_id }} \
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 68b7000b..0638f615 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -89,7 +89,7 @@ def test_wan_pretrain_mock(self, tmp_path):
                 cmd,
                 capture_output=True,
                 text=True,
-                timeout=300,  # 5 minute timeout
+                timeout=3000,  # 5 minute timeout
                 check=True,
             )
 

From d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3 Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 12:13:01 -0600
Subject: [PATCH 10/15] Revert GHA changes

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 .github/workflows/cicd-main.yml | 76 ++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 989130cb..81af9136 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -51,52 +51,52 @@ jobs:
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-  # cicd-unit-tests:
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - script: L0_Unit_Tests_GPU
-  #           runner: self-hosted-nemo
-  #           timeout: 30
-  #         - script: L0_Unit_Tests_CPU
-  #           runner: linux-amd64-cpu16
-  #           cpu-only: true
-  #   needs: [cicd-container-build]
-  #   runs-on: ${{ matrix.runner }}
-  #   name: ${{ matrix.script }}
-  #   environment: nemo-ci
-  #   steps:
-  #     - name: Checkout
-  #       uses: actions/checkout@v4
-  #       with:
-  #         submodules: recursive
-  #     - name: main
-  #       uses: ./.github/actions/test-template
-  #       with:
-  #         runner: ${{ runner.name }}
-  #         script: ${{ matrix.script }}
-  #         timeout: ${{ matrix.timeout || 10 }}
-  #         is_unit_test: "true"
-  #         image: dfm
-  #         cpu-only: ${{ matrix.cpu-only || false }}
-  #         has-azure-credentials: "true"
-  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+  cicd-unit-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU
+            runner: self-hosted-nemo
+            timeout: 30
+          - script: L0_Unit_Tests_CPU
+            runner: linux-amd64-cpu16
+            cpu-only: true
+    needs: [cicd-container-build]
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    environment: nemo-ci
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: main
+        uses: ./.github/actions/test-template
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          timeout: ${{ matrix.timeout || 10 }}
+          is_unit_test: "true"
+          image: dfm
+          cpu-only: ${{ matrix.cpu-only || false }}
+          has-azure-credentials: "true"
+          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   cicd-e2e-tests:
     strategy:
       fail-fast: false
       matrix:
         include:
-          # - script: L2_Functional_Tests_GPU
-          #   runner: self-hosted-nemo
-          #   timeout: 30
+          - script: L2_Functional_Tests_GPU
+            runner: self-hosted-nemo
+            timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
             runner: self-hosted-nemo
             timeout: 30
-    needs: [cicd-container-build]
+    needs: [cicd-unit-tests]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -121,7 +121,7 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - cicd-container-build
-      # - cicd-unit-tests
+      - cicd-unit-tests
       - cicd-e2e-tests
     if: always()
     runs-on: ubuntu-latest

From 1b3184affe5b5e0ab02e8252e53f17a5986bf32f Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 12:15:43 -0600
Subject: [PATCH 11/15] Move uv run group call to L2_Mcore_Mock_Tests_GPU

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 2 +-
 tests/functional_tests/test_mcore_wan_pretrain.py | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
index 2e99db05..b8d237a1 100644
--- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
+CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 0638f615..312a299f 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -43,10 +43,6 @@ def test_wan_pretrain_mock(self, tmp_path):
 
         # Build the command for the mock run
         cmd = [
-            "uv",
-            "run",
-            "--group",
-            "megatron-bridge",
             "python",
             "-m",
             "torch.distributed.run",

From 3a64d34a28b3787933d675e4a51c261d193e7c7d Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 12:17:33 -0600
Subject: [PATCH 12/15] Set test back to 5 minute timeout

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
---
 tests/functional_tests/test_mcore_wan_pretrain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 312a299f..780bb253 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -85,7 +85,7 @@ def test_wan_pretrain_mock(self, tmp_path):
                 cmd,
                 capture_output=True,
                 text=True,
-                timeout=3000,  # 5 minute timeout
+                timeout=300,  # 5 minute timeout
                 check=True,
             )
 

From afcae6cef26b08f0ab2f1182ee3458733472643e Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 12:09:16 -0800
Subject: [PATCH 13/15] Megatron fixes (#49)

* Enhance DiT and Wan layer specifications

- Updated `get_query_key_value_tensors` method in `dit_attention.py` to include an `output_gate` parameter and set `split_qkv` to default to `True`.
- Modified `WanLayerWithAdaLN` class in `wan_layer_spec.py` to add `rotary_pos_cos_sin` parameter for improved positional encoding handling.

* Implement ProcessGroupCollection initialization in DiT and Wan models

- Added initialization of `pg_collection` in both `DiTCrossAttentionModel` and `WanModel` to ensure proper handling of process groups.
- This change checks if `pg_collection` exists and is not None before assigning it, enhancing the robustness of the models.

* Update CONTRIBUTING.md to include detailed setup instructions for development environment and Docker container usage. Added sections for building and running the container, as well as setting the PYTHONPATH for DFM.

* Refactor import statements in dit_model.py to streamline dependencies. Removed redundant import of ProcessGroupCollection, enhancing code clarity and maintainability.

* Refactor code style in DiT and Wan models

- Updated string quotes in `dit_model.py` and `wan_model.py` for consistency, changing from single to double quotes.
- Reformatted the `get_query_key_value_tensors` method call in `dit_attention.py` for improved readability by breaking it into multiple lines.

* Revert M4 changes

* Ruff

* Ruff

* Lint

---------

Co-authored-by: Abhinav Garg <abhinavg@stanford.edu>
---
 CONTRIBUTING.md                               | 28 +++++++++++++++++++
 .../megatron/model/common/dit_attention.py    |  8 ++++--
 dfm/src/megatron/model/dit/dit_model.py       |  1 -
 dfm/src/megatron/model/wan/wan_layer_spec.py  |  1 +
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 68ab66d4..aed9cf99 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,32 @@
 # Contributing To NeMo DFM
+## 🛠️ Setting Up Your Environment
+
+Use the instructions below to set up a dev environment and a dev container.
+
+### Building a container
+```bash
+# We recommend getting the latest commits for Megatron-Bridge and Automodel
+# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands
+git submodule update --init --recursive --remote # Get all the 3rd party submodules
+cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
+# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
+git checkout <commit_hash>
+cd ../../../../
+docker build -f docker/Dockerfile.ci -t dfm:latest .
+```
+
+### Run the container
+```bash
+docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
+```
+
+### Inside the container
+```bash
+# Add DFM to PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:/opt/DFM
+
+# Run a Mock Run:
+```
 
 ## Signing Your Work
 
diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py
index 321e9b08..acf39d47 100644
--- a/dfm/src/megatron/model/common/dit_attention.py
+++ b/dfm/src/megatron/model/common/dit_attention.py
@@ -100,7 +100,7 @@ def __init__(
         else:
             self.k_layernorm = None
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
         """
         Derives `query`, `key` and `value` tensors from `hidden_states`.
         """
@@ -251,13 +251,15 @@ def __init__(
             is_expert=False,
         )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
         """
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
         """
 
-        query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states)
+        query, key, value = super().get_query_key_value_tensors(
+            hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv
+        )
 
         # gather query and key heads across TP ranks if self.layernorm_across_heads is True
         if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1:
diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py
index e3ae8a29..38cb8422 100644
--- a/dfm/src/megatron/model/dit/dit_model.py
+++ b/dfm/src/megatron/model/dit/dit_model.py
@@ -105,7 +105,6 @@ def __init__(
         super(DiTCrossAttentionModel, self).__init__(config=config)
 
         self.config: TransformerConfig = config
-
         self.transformer_decoder_layer_spec = transformer_decoder_layer_spec()
         self.pre_process = pre_process
         self.post_process = post_process
diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py
index 2b355930..a0d6354e 100644
--- a/dfm/src/megatron/model/wan/wan_layer_spec.py
+++ b/dfm/src/megatron/model/wan/wan_layer_spec.py
@@ -162,6 +162,7 @@ def forward(
         packed_seq_params=None,
         sequence_len_offset=None,
         inference_context=None,
+        rotary_pos_cos_sin=None,
     ):
         # the timestep embedding is stored in attention_mask argument
         timestep_emb = attention_mask

From b87b7dc74d6ac7f37dbe4d28ae19bca533ef21a6 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 12:54:43 -0800
Subject: [PATCH 14/15] Revert "Revert GHA changes"

This reverts commit d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3.
---
 .github/workflows/cicd-main.yml | 76 ++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 81af9136..989130cb 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -51,52 +51,52 @@ jobs:
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-  cicd-unit-tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - script: L0_Unit_Tests_GPU
-            runner: self-hosted-nemo
-            timeout: 30
-          - script: L0_Unit_Tests_CPU
-            runner: linux-amd64-cpu16
-            cpu-only: true
-    needs: [cicd-container-build]
-    runs-on: ${{ matrix.runner }}
-    name: ${{ matrix.script }}
-    environment: nemo-ci
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: main
-        uses: ./.github/actions/test-template
-        with:
-          runner: ${{ runner.name }}
-          script: ${{ matrix.script }}
-          timeout: ${{ matrix.timeout || 10 }}
-          is_unit_test: "true"
-          image: dfm
-          cpu-only: ${{ matrix.cpu-only || false }}
-          has-azure-credentials: "true"
-          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+  # cicd-unit-tests:
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       include:
+  #         - script: L0_Unit_Tests_GPU
+  #           runner: self-hosted-nemo
+  #           timeout: 30
+  #         - script: L0_Unit_Tests_CPU
+  #           runner: linux-amd64-cpu16
+  #           cpu-only: true
+  #   needs: [cicd-container-build]
+  #   runs-on: ${{ matrix.runner }}
+  #   name: ${{ matrix.script }}
+  #   environment: nemo-ci
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@v4
+  #       with:
+  #         submodules: recursive
+  #     - name: main
+  #       uses: ./.github/actions/test-template
+  #       with:
+  #         runner: ${{ runner.name }}
+  #         script: ${{ matrix.script }}
+  #         timeout: ${{ matrix.timeout || 10 }}
+  #         is_unit_test: "true"
+  #         image: dfm
+  #         cpu-only: ${{ matrix.cpu-only || false }}
+  #         has-azure-credentials: "true"
+  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   cicd-e2e-tests:
     strategy:
       fail-fast: false
       matrix:
         include:
-          - script: L2_Functional_Tests_GPU
-            runner: self-hosted-nemo
-            timeout: 30
+          # - script: L2_Functional_Tests_GPU
+          #   runner: self-hosted-nemo
+          #   timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
             runner: self-hosted-nemo
             timeout: 30
-    needs: [cicd-unit-tests]
+    needs: [cicd-container-build]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -121,7 +121,7 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - cicd-container-build
-      - cicd-unit-tests
+      # - cicd-unit-tests
       - cicd-e2e-tests
     if: always()
     runs-on: ubuntu-latest

From 0bc8872724aa8c8041d32eae2cf1d7217d96cda3 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 13:23:59 -0800
Subject: [PATCH 15/15] tempfortest: timeout setting

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 tests/functional_tests/test_mcore_wan_pretrain.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 780bb253..1d8122ae 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path):
             "optimizer.lr=5e-6",
             "optimizer.min_lr=5e-6",
             "train.eval_iters=0",
+            "train.max_steps=10",
             "scheduler.lr_decay_style=constant",
             "scheduler.lr_warmup_iters=0",
             "model.seq_length=2048",
@@ -85,7 +86,7 @@ def test_wan_pretrain_mock(self, tmp_path):
                 cmd,
                 capture_output=True,
                 text=True,
-                timeout=300,  # 5 minute timeout
+                timeout=1800,  # 30 minute timeout
                 check=True,
             )
 
@@ -102,6 +103,6 @@ def test_wan_pretrain_mock(self, tmp_path):
             )
 
         except subprocess.TimeoutExpired:
-            pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
+            pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
         except subprocess.CalledProcessError as e:
             pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")