Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ runs:
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=1 \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--env RUN_ID=${{ github.run_id }} \
Expand Down
78 changes: 41 additions & 37 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,54 +45,58 @@ jobs:
with:
image-name: dfm
dockerfile: docker/Dockerfile.ci
runner: self-hosted-nemo
secrets:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-unit-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L0_Unit_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
timeout: 30
- script: L0_Unit_Tests_CPU
runner: linux-amd64-cpu16
cpu-only: true
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.script }}
environment: nemo-ci
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
- name: main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
timeout: ${{ matrix.timeout || 10 }}
is_unit_test: "true"
image: dfm
cpu-only: ${{ matrix.cpu-only || false }}
has-azure-credentials: "true"
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# cicd-unit-tests:
# strategy:
# fail-fast: false
# matrix:
# include:
# - script: L0_Unit_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
# - script: L0_Unit_Tests_CPU
# runner: linux-amd64-cpu16
# cpu-only: true
# needs: [cicd-container-build]
# runs-on: ${{ matrix.runner }}
# name: ${{ matrix.script }}
# environment: nemo-ci
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: main
# uses: ./.github/actions/test-template
# with:
# runner: ${{ runner.name }}
# script: ${{ matrix.script }}
# timeout: ${{ matrix.timeout || 10 }}
# is_unit_test: "true"
# image: dfm
# cpu-only: ${{ matrix.cpu-only || false }}
# has-azure-credentials: "true"
# azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
# azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
# azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-e2e-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L2_Functional_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
# - script: L2_Functional_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
- script: L2_Mcore_Mock_Tests_GPU
runner: self-hosted-nemo
timeout: 30
needs: [cicd-unit-tests]
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
environment: nemo-ci
Expand All @@ -117,7 +121,7 @@ jobs:
Nemo_CICD_Test:
needs:
- cicd-container-build
- cicd-unit-tests
# - cicd-unit-tests
- cicd-e2e-tests
if: always()
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge
Submodule Megatron-Bridge updated 77 files
+5 −1 .github/workflows/cache-hf-model.yml
+2 −1 .github/workflows/cicd-main.yml
+1 −1 3rdparty/Megatron-LM
+1 −1 CONTRIBUTING.md
+1 −0 docs/models/llm/index.md
+183 −0 docs/models/llm/olmoe.md
+1 −0 docs/models/vlm/index.md
+143 −0 docs/models/vlm/qwen2.5-vl.md
+7 −0 docs/training/checkpointing.md
+0 −1 docs/training/peft.md
+674 −0 examples/conversion/compare_text_generation.py
+1 −1 examples/quantization/ptq_generate.py
+48 −0 examples/recipes/qwen3_next/conf/qwen3_next_80b_a3b_finetune_override_example.yaml
+147 −0 examples/recipes/qwen3_next/finetune_qwen3_next_80b_a3b.py
+9 −2 examples/recipes/qwen_vl/finetune_qwen25_vl.py
+16 −31 scripts/performance/argument_parser.py
+22 −12 scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
+8 −8 scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py
+38 −22 scripts/performance/configs/llama3/llama3_llm_pretrain.py
+2 −5 scripts/performance/configs/llama3/workload_base_configs.py
+27 −11 scripts/performance/configs/llama31/llama31_llm_pretrain.py
+6 −6 scripts/performance/configs/nemotronh/nemotronh_llm_pretrain.py
+24 −23 scripts/performance/configs/qwen3/qwen3_llm_pretrain.py
+12 −25 scripts/performance/perf_plugins.py
+11 −8 scripts/performance/setup_experiment.py
+9 −6 scripts/performance/utils/executors.py
+34 −25 scripts/performance/utils/helpers.py
+2 −6 scripts/performance/utils/utils.py
+105 −0 src/megatron/bridge/data/iterator_utils.py
+1 −1 src/megatron/bridge/data/loaders.py
+39 −1 src/megatron/bridge/models/conversion/auto_bridge.py
+112 −4 src/megatron/bridge/models/conversion/model_bridge.py
+8 −8 src/megatron/bridge/models/conversion/param_mapping.py
+2 −2 src/megatron/bridge/models/conversion/utils.py
+1 −0 src/megatron/bridge/models/gemma/gemma3_provider.py
+3 −0 src/megatron/bridge/models/gpt_provider.py
+2 −0 src/megatron/bridge/models/mamba/mamba_provider.py
+1 −1 src/megatron/bridge/models/olmoe/olmoe_provider.py
+31 −7 src/megatron/bridge/peft/lora.py
+2 −0 src/megatron/bridge/recipes/gemma/__init__.py
+245 −1 src/megatron/bridge/recipes/gemma/gemma3.py
+5 −5 src/megatron/bridge/recipes/gpt_oss/gpt_oss.py
+22 −0 src/megatron/bridge/recipes/llama/__init__.py
+534 −22 src/megatron/bridge/recipes/llama/llama3.py
+24 −0 src/megatron/bridge/recipes/olmoe/__init__.py
+682 −0 src/megatron/bridge/recipes/olmoe/olmoe_7b.py
+6 −0 src/megatron/bridge/recipes/qwen/__init__.py
+294 −1 src/megatron/bridge/recipes/qwen/qwen3_moe.py
+292 −49 src/megatron/bridge/recipes/qwen/qwen3_next.py
+73 −9 src/megatron/bridge/recipes/qwen_vl/qwen25_vl.py
+37 −21 src/megatron/bridge/training/checkpointing.py
+22 −1 src/megatron/bridge/training/config.py
+44 −4 src/megatron/bridge/training/eval.py
+0 −1 src/megatron/bridge/training/pretrain.py
+6 −0 src/megatron/bridge/training/setup.py
+125 −68 src/megatron/bridge/training/train.py
+154 −38 src/megatron/bridge/training/utils/checkpoint_utils.py
+184 −121 tests/end_to_end_tests/evaluate_recipe_training.py
+28 −0 tests/functional_tests/L2_Launch_post_training_quantization.sh
+1 −0 tests/functional_tests/L2_Launch_quantization_aware_training.sh
+81 −10 tests/functional_tests/quantization/test_qat_workflow.py
+27 −10 tests/functional_tests/quantization/test_quantization_workflow.py
+1 −0 tests/functional_tests/training/test_finetune_lora.py
+2 −0 tests/functional_tests/training/test_pretrain_resume.py
+1 −1 tests/functional_tests/utils.py
+143 −0 tests/unit_tests/data/test_finetuning.py
+109 −0 tests/unit_tests/data/test_iterator_utils.py
+40 −0 tests/unit_tests/models/test_auto_bridge.py
+119 −0 tests/unit_tests/models/test_model_bridge_lora.py
+262 −14 tests/unit_tests/recipes/test_gemma3_recipes.py
+356 −24 tests/unit_tests/recipes/test_llama_recipes.py
+457 −0 tests/unit_tests/recipes/test_olmoe_recipes.py
+216 −17 tests/unit_tests/recipes/test_qwen_recipes.py
+26 −7 tests/unit_tests/training/test_checkpointing.py
+233 −0 tests/unit_tests/training/test_train.py
+46 −0 tests/unit_tests/training/utils/test_checkpoint_utils.py
+104 −164 uv.lock
28 changes: 28 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,32 @@
# Contributing To NeMo DFM
## 🛠️ Setting Up Your Environment

Use the instructions below to set up a dev environment and a dev container.

### Building a container
```bash
# We recommend getting the latest commits for Megatron-Bridge and Automodel.
# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands.
git submodule update --init --recursive --remote # Get all the 3rd party submodules
cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
git checkout <commit_hash>
cd ../../../../
docker build -f docker/Dockerfile.ci -t dfm:latest .
```

### Run the container
```bash
docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
```

### Inside the container
```bash
# Add DFM to PYTHONPATH
export PYTHONPATH=$PYTHONPATH:/opt/DFM

# Run a Mock Run:
```

## Signing Your Work

Expand Down
8 changes: 5 additions & 3 deletions dfm/src/megatron/model/common/dit_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
else:
self.k_layernorm = None

def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
"""
Derives `query`, `key` and `value` tensors from `hidden_states`.
"""
Expand Down Expand Up @@ -251,13 +251,15 @@ def __init__(
is_expert=False,
)

def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
"""
Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
from `key_value_states`.
"""

query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states)
query, key, value = super().get_query_key_value_tensors(
hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv
)

# gather query and key heads across TP ranks if self.layernorm_across_heads is True
if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1:
Expand Down
1 change: 0 additions & 1 deletion dfm/src/megatron/model/dit/dit_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def __init__(
super(DiTCrossAttentionModel, self).__init__(config=config)

self.config: TransformerConfig = config

self.transformer_decoder_layer_spec = transformer_decoder_layer_spec()
self.pre_process = pre_process
self.post_process = post_process
Expand Down
1 change: 1 addition & 0 deletions dfm/src/megatron/model/wan/wan_layer_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def forward(
packed_seq_params=None,
sequence_len_offset=None,
inference_context=None,
rotary_pos_cos_sin=None,
):
# the timestep embedding is stored in attention_mask argument
timestep_emb = attention_mask
Expand Down
14 changes: 13 additions & 1 deletion docker/Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
# Copy dependency files and source code (needed for dynamic version resolution)
COPY pyproject.toml uv.lock ./
COPY dfm ./dfm
COPY 3rdparty ./3rdparty

# Copy 3rdparty dependencies with minimal files for metadata resolution
# Copy Automodel
COPY 3rdparty/Automodel ./3rdparty/Automodel

# Copy Megatron-Bridge
COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src

# Copy minimal Megatron-LM files for metadata (prevents full source build)
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/

# Install dependencies in two steps:
# 1. Install build dependencies first (required for packages with no-build-isolation)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ explicit = true
[tool.uv.sources]
nemo-automodel = { path = "3rdparty/Automodel" }
megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
nvidia-resiliency-ext = { index = "pypi" }

Expand Down
14 changes: 14 additions & 0 deletions tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CI entry point for the L2 Mcore mock tests: runs the WAN pretrain mock test
# under coverage using the `megatron-bridge` uv dependency group.
#   - CUDA_VISIBLE_DEVICES="0,1": restricts the run to the first two GPUs.
#   - coverage run -a: appends results to the shared data file at /opt/DFM/.coverage.
#   - -m "not pleasefixme": skips tests marked as known-broken.
#   - --with_downloads: presumably enables test-asset downloads — defined by the
#     project's pytest conftest (not visible here).
CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
108 changes: 108 additions & 0 deletions tests/functional_tests/test_mcore_wan_pretrain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functional smoke tests for Mcore WAN pretrain mock runs."""

import os
import subprocess

import pytest


class TestMcoreWanPretrain:
    """Functional smoke tests for the Mcore WAN pretrain recipe (mock mode)."""

    @pytest.mark.run_only_on("GPU")
    def test_wan_pretrain_mock(self, tmp_path):
        """
        Functional test for the WAN pretrain recipe with mock data.

        Launches ``examples/megatron/recipes/wan/pretrain_wan.py`` via
        ``torch.distributed.run`` on a single process with a tiny model
        configuration, ensuring:

        1. The distributed training can start without errors
        2. Model initialization works correctly
        3. Forward/backward passes complete successfully
        4. The training loop executes without crashes

        Args:
            tmp_path: pytest-provided per-test temporary directory
                (``pathlib.Path``) used for the mock dataset and checkpoints.
        """
        # Per-test scratch directories so concurrent/repeated runs never
        # share dataset or checkpoint state.
        dataset_path = os.path.join(tmp_path, "mock_dataset")
        checkpoint_dir = os.path.join(tmp_path, "checkpoints")
        os.makedirs(dataset_path, exist_ok=True)
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Minimal single-process mock configuration: no tensor/pipeline/context
        # parallelism, a 3-layer model, and only 10 training steps.
        cmd = [
            "python",
            "-m",
            "torch.distributed.run",
            "--nproc_per_node=1",
            "examples/megatron/recipes/wan/pretrain_wan.py",
            "--training-mode",
            "pretrain",
            "model.tensor_model_parallel_size=1",
            "model.pipeline_model_parallel_size=1",
            "model.context_parallel_size=1",
            "model.crossattn_emb_size=1536",
            "model.hidden_size=1536",
            "model.ffn_hidden_size=8960",
            "model.num_attention_heads=12",
            "model.num_layers=3",
            "model.qkv_format=thd",
            f"dataset.path={dataset_path}",
            f"checkpoint.save={checkpoint_dir}",
            f"checkpoint.load={checkpoint_dir}",
            "checkpoint.load_optim=false",
            "checkpoint.save_interval=200",
            "optimizer.lr=5e-6",
            "optimizer.min_lr=5e-6",
            "train.eval_iters=0",
            "train.max_steps=10",
            "scheduler.lr_decay_style=constant",
            "scheduler.lr_warmup_iters=0",
            "model.seq_length=2048",
            "dataset.seq_length=2048",
            "train.global_batch_size=2",
            "train.micro_batch_size=1",
            "dataset.global_batch_size=2",
            "dataset.micro_batch_size=1",
            "logger.log_interval=1",
            "--mock",
        ]

        # Only the subprocess launch lives in the try block, so the except
        # clauses can never mask an assertion failure from the checks below.
        try:
            # check=True raises CalledProcessError on any nonzero exit code,
            # so reaching the code after this call already implies success —
            # no separate returncode assertion is needed.
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=1800,  # 30 minute hard cap for the whole run
                check=True,
            )
        except subprocess.TimeoutExpired:
            pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
        except subprocess.CalledProcessError as e:
            pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")

        # Surface subprocess output in the pytest log for debugging.
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)

        # Sanity check: training progress should have been logged. Search both
        # streams, since where the recipe logs iterations is not fixed here.
        combined_output = (result.stdout + result.stderr).lower()
        assert "iteration" in combined_output, "Expected to see iteration progress in output"