diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b3f08665..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -45,54 +45,58 @@ jobs: with: image-name: dfm dockerfile: docker/Dockerfile.ci + runner: self-hosted-nemo secrets: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - 
script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + - script: L2_Mcore_Mock_Tests_GPU + runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -117,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 8e21f81a..4e4ce420 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 +Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68ab66d4..aed9cf99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,32 @@ # Contributing To NeMo DFM +## 🛠️ Setting Up Your Environment + +Use the instructions below to setup a dev environment and a dev container + +### 
Building a container
+```bash
+# We recommend getting the latest commits for Megatron-Bridge and Automodel
+# The easiest way to do that might be to remove the 3rdparty directory completely before running the following commands
+git submodule update --init --recursive --remote # Get all the 3rd party submodules
+cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
+# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
+git checkout <megatron-lm-commit>
+cd ../../../../
+docker build -f docker/Dockerfile.ci -t dfm:latest .
+```
+
+### Run the container
+```bash
+docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
+```
+
+### Inside the container
+```bash
+# Add DFM to PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:/opt/DFM
+
+# Run a Mock Run:
+```
 
 ## Signing Your Work
 
diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py
index 321e9b08..acf39d47 100644
--- a/dfm/src/megatron/model/common/dit_attention.py
+++ b/dfm/src/megatron/model/common/dit_attention.py
@@ -100,7 +100,7 @@ def __init__(
         else:
             self.k_layernorm = None
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
         """
         Derives `query`, `key` and `value` tensors from `hidden_states`.
         """
@@ -251,13 +251,15 @@ def __init__(
             is_expert=False,
         )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
         """
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
""" - query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states) + query, key, value = super().get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) # gather query and key heads across TP ranks if self.layernorm_across_heads is True if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1: diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py index e3ae8a29..38cb8422 100644 --- a/dfm/src/megatron/model/dit/dit_model.py +++ b/dfm/src/megatron/model/dit/dit_model.py @@ -105,7 +105,6 @@ def __init__( super(DiTCrossAttentionModel, self).__init__(config=config) self.config: TransformerConfig = config - self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() self.pre_process = pre_process self.post_process = post_process diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py index 2b355930..a0d6354e 100644 --- a/dfm/src/megatron/model/wan/wan_layer_spec.py +++ b/dfm/src/megatron/model/wan/wan_layer_spec.py @@ -162,6 +162,7 @@ def forward( packed_seq_params=None, sequence_len_offset=None, inference_context=None, + rotary_pos_cos_sin=None, ): # the timestep embedding is stored in attention_mask argument timestep_emb = attention_mask diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 7096b3c6..8de9c016 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages # Copy dependency files and source code (needed for dynamic version resolution) COPY pyproject.toml uv.lock ./ COPY dfm ./dfm -COPY 3rdparty ./3rdparty + +# Copy 3rdparty dependencies with minimal files for metadata resolution +# Copy Automodel +COPY 3rdparty/Automodel ./3rdparty/Automodel + +# Copy Megatron-Bridge +COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/ +COPY 
3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src + +# Copy minimal Megatron-LM files for metadata (prevents full source build) +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ # Install dependencies in two steps: # 1. Install build dependencies first (required for packages with no-build-isolation) diff --git a/pyproject.toml b/pyproject.toml index 56e934d9..05a40a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } +megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh new file mode 100644 index 00000000..b8d237a1 --- /dev/null +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -0,0 +1,14 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py new file mode 100644 index 00000000..1d8122ae --- /dev/null +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Mcore WAN pretrain mock runs.""" + +import os +import subprocess + +import pytest + + +class TestMcoreWanPretrain: + """Test class for Mcore WAN pretrain functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_wan_pretrain_mock(self, tmp_path): + """ + Functional test for WAN pretrain recipe with mock data. + + This test verifies that the WAN pretrain recipe can run successfully + in mock mode with minimal configuration, ensuring: + 1. The distributed training can start without errors + 2. Model initialization works correctly + 3. Forward/backward passes complete successfully + 4. 
The training loop executes without crashes + """ + # Set up temporary directories for dataset and checkpoints + dataset_path = os.path.join(tmp_path, "mock_dataset") + checkpoint_dir = os.path.join(tmp_path, "checkpoints") + os.makedirs(dataset_path, exist_ok=True) + os.makedirs(checkpoint_dir, exist_ok=True) + + # Build the command for the mock run + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=1", + "examples/megatron/recipes/wan/pretrain_wan.py", + "--training-mode", + "pretrain", + "model.tensor_model_parallel_size=1", + "model.pipeline_model_parallel_size=1", + "model.context_parallel_size=1", + "model.crossattn_emb_size=1536", + "model.hidden_size=1536", + "model.ffn_hidden_size=8960", + "model.num_attention_heads=12", + "model.num_layers=3", + "model.qkv_format=thd", + f"dataset.path={dataset_path}", + f"checkpoint.save={checkpoint_dir}", + f"checkpoint.load={checkpoint_dir}", + "checkpoint.load_optim=false", + "checkpoint.save_interval=200", + "optimizer.lr=5e-6", + "optimizer.min_lr=5e-6", + "train.eval_iters=0", + "train.max_steps=10", + "scheduler.lr_decay_style=constant", + "scheduler.lr_warmup_iters=0", + "model.seq_length=2048", + "dataset.seq_length=2048", + "train.global_batch_size=2", + "train.micro_batch_size=1", + "dataset.global_batch_size=2", + "dataset.micro_batch_size=1", + "logger.log_interval=1", + "--mock", + ] + + # Run the command with a timeout + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=1800, # 30 minute timeout + check=True, + ) + + # Print output for debugging if needed + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + + # Basic verification that the run completed + assert result.returncode == 0, f"Command failed with return code {result.returncode}" + + # Check for common success indicators in output + assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( + "Expected to see iteration progress in output" + ) + 
+ except subprocess.TimeoutExpired: + pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") + except subprocess.CalledProcessError as e: + pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")