diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b3f08665..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -45,54 +45,58 @@ jobs: with: image-name: dfm dockerfile: docker/Dockerfile.ci + runner: self-hosted-nemo secrets: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - 
script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + - script: L2_Mcore_Mock_Tests_GPU + runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -117,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 8e21f81a..4e4ce420 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 +Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68ab66d4..aed9cf99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,32 @@ # Contributing To NeMo DFM +## 🛠️ Setting Up Your Environment + +Use the instructions below to setup a dev environment and a dev container + +### 
Building a container
+```bash
+# We recommend getting the latest commits for Megatron-Bridge and Automodel
+# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands
+git submodule update --init --recursive --remote # Get all the 3rd party submodules
+cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
+# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
+git checkout <megatron-lm-commit>
+cd ../../../../
+docker build -f docker/Dockerfile.ci -t dfm:latest .
+```
+
+### Run the container
+```bash
+docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
+```
+
+### Inside the container
+```bash
+# Add DFM to PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:/opt/DFM
+
+# Run a Mock Run:
+```
 
 ## Signing Your Work
 
diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py
index 321e9b08..acf39d47 100644
--- a/dfm/src/megatron/model/common/dit_attention.py
+++ b/dfm/src/megatron/model/common/dit_attention.py
@@ -100,7 +100,7 @@ def __init__(
         else:
             self.k_layernorm = None
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
         """
         Derives `query`, `key` and `value` tensors from `hidden_states`.
         """
@@ -251,13 +251,15 @@ def __init__(
             is_expert=False,
         )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
         """
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
""" - query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states) + query, key, value = super().get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) # gather query and key heads across TP ranks if self.layernorm_across_heads is True if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1: diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py index e3ae8a29..38cb8422 100644 --- a/dfm/src/megatron/model/dit/dit_model.py +++ b/dfm/src/megatron/model/dit/dit_model.py @@ -105,7 +105,6 @@ def __init__( super(DiTCrossAttentionModel, self).__init__(config=config) self.config: TransformerConfig = config - self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() self.pre_process = pre_process self.post_process = post_process diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py index 2b355930..a0d6354e 100644 --- a/dfm/src/megatron/model/wan/wan_layer_spec.py +++ b/dfm/src/megatron/model/wan/wan_layer_spec.py @@ -162,6 +162,7 @@ def forward( packed_seq_params=None, sequence_len_offset=None, inference_context=None, + rotary_pos_cos_sin=None, ): # the timestep embedding is stored in attention_mask argument timestep_emb = attention_mask diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 7096b3c6..8de9c016 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages # Copy dependency files and source code (needed for dynamic version resolution) COPY pyproject.toml uv.lock ./ COPY dfm ./dfm -COPY 3rdparty ./3rdparty + +# Copy 3rdparty dependencies with minimal files for metadata resolution +# Copy Automodel +COPY 3rdparty/Automodel ./3rdparty/Automodel + +# Copy Megatron-Bridge +COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/ +COPY 
3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src + +# Copy minimal Megatron-LM files for metadata (prevents full source build) +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ # Install dependencies in two steps: # 1. Install build dependencies first (required for packages with no-build-isolation) diff --git a/pyproject.toml b/pyproject.toml index 56e934d9..05a40a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } +megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh new file mode 100644 index 00000000..b8d237a1 --- /dev/null +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -0,0 +1,14 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py new file mode 100644 index 00000000..1d8122ae --- /dev/null +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Mcore WAN pretrain mock runs.""" + +import os +import subprocess + +import pytest + + +class TestMcoreWanPretrain: + """Test class for Mcore WAN pretrain functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_wan_pretrain_mock(self, tmp_path): + """ + Functional test for WAN pretrain recipe with mock data. + + This test verifies that the WAN pretrain recipe can run successfully + in mock mode with minimal configuration, ensuring: + 1. The distributed training can start without errors + 2. Model initialization works correctly + 3. Forward/backward passes complete successfully + 4. 
The training loop executes without crashes + """ + # Set up temporary directories for dataset and checkpoints + dataset_path = os.path.join(tmp_path, "mock_dataset") + checkpoint_dir = os.path.join(tmp_path, "checkpoints") + os.makedirs(dataset_path, exist_ok=True) + os.makedirs(checkpoint_dir, exist_ok=True) + + # Build the command for the mock run + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=1", + "examples/megatron/recipes/wan/pretrain_wan.py", + "--training-mode", + "pretrain", + "model.tensor_model_parallel_size=1", + "model.pipeline_model_parallel_size=1", + "model.context_parallel_size=1", + "model.crossattn_emb_size=1536", + "model.hidden_size=1536", + "model.ffn_hidden_size=8960", + "model.num_attention_heads=12", + "model.num_layers=3", + "model.qkv_format=thd", + f"dataset.path={dataset_path}", + f"checkpoint.save={checkpoint_dir}", + f"checkpoint.load={checkpoint_dir}", + "checkpoint.load_optim=false", + "checkpoint.save_interval=200", + "optimizer.lr=5e-6", + "optimizer.min_lr=5e-6", + "train.eval_iters=0", + "train.max_steps=10", + "scheduler.lr_decay_style=constant", + "scheduler.lr_warmup_iters=0", + "model.seq_length=2048", + "dataset.seq_length=2048", + "train.global_batch_size=2", + "train.micro_batch_size=1", + "dataset.global_batch_size=2", + "dataset.micro_batch_size=1", + "logger.log_interval=1", + "--mock", + ] + + # Run the command with a timeout + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=1800, # 30 minute timeout + check=True, + ) + + # Print output for debugging if needed + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + + # Basic verification that the run completed + assert result.returncode == 0, f"Command failed with return code {result.returncode}" + + # Check for common success indicators in output + assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( + "Expected to see iteration progress in output" + ) + 
+ except subprocess.TimeoutExpired: + pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") + except subprocess.CalledProcessError as e: + pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")