From 50f058db97a838908bc2040de6f700f6aa8e9eb2 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 09:22:19 -0600 Subject: [PATCH 01/17] ci: Update gpu runners to use self-hosted-nemo Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7f792d47..81af9136 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -45,6 +45,7 @@ jobs: with: image-name: dfm dockerfile: docker/Dockerfile.ci + runner: self-hosted-nemo secrets: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} @@ -56,7 +57,7 @@ jobs: matrix: include: - script: L0_Unit_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 - script: L0_Unit_Tests_CPU runner: linux-amd64-cpu16 @@ -90,10 +91,10 @@ jobs: matrix: include: - script: L2_Functional_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 - script: L2_Mcore_Mock_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} From 4bbb20c94c8554060e0aed45c23626bd9726d5f8 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 10:45:45 -0600 Subject: [PATCH 02/17] Use uv run in test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 76 +++++++++---------- .../test_mcore_wan_pretrain.py | 2 + 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 81af9136..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: self-hosted-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: self-hosted-nemo - timeout: 30 + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 780bb253..6ca8ee34 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -43,6 +43,8 @@ def test_wan_pretrain_mock(self, tmp_path): # Build the command for the mock run cmd = [ + "uv", + "run", "python", "-m", "torch.distributed.run", From b412e4eec0b29fe14d736f25d6d66203f58e7ed4 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 11:24:46 -0600 Subject: [PATCH 03/17] Ensure uv group megatron-bridge is used for test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- tests/functional_tests/test_mcore_wan_pretrain.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 6ca8ee34..68b7000b 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -45,6 +45,8 @@ def test_wan_pretrain_mock(self, tmp_path): cmd = [ "uv", "run", + "--group", + "megatron-bridge", "python", "-m", "torch.distributed.run", From f25bae695bb2088d429ecc92cb1d7d858a71d2be Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sat, 15 Nov 2025 17:48:53 +0000 Subject: [PATCH 04/17] Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain --- .github/actions/test-template/action.yml | 2 +- tests/functional_tests/test_mcore_wan_pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 68b7000b..0638f615 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -89,7 +89,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=3000, # 5 minute timeout check=True, ) From 39df47278fc69e4c5b845a68323e072f4933ac9b Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sat, 15 Nov 2025 17:48:53 +0000 Subject: [PATCH 05/17] Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain Signed-off-by: Charlie Truong --- .github/actions/test-template/action.yml | 2 +- tests/functional_tests/test_mcore_wan_pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 68b7000b..0638f615 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -89,7 +89,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=3000, # 5 minute timeout check=True, ) From d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:13:01 -0600 Subject: [PATCH 06/17] Revert GHA changes Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 76 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 989130cb..81af9136 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - # cicd-unit-tests: - # strategy: - # fail-fast: false - # matrix: - # include: - # - script: L0_Unit_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 - # - script: L0_Unit_Tests_CPU - # runner: linux-amd64-cpu16 - # cpu-only: true - # needs: [cicd-container-build] - # runs-on: ${{ matrix.runner }} - # name: ${{ matrix.script }} - # environment: nemo-ci - # steps: - # - name: Checkout - # uses: actions/checkout@v4 - # with: - # submodules: recursive - # - name: main - # uses: ./.github/actions/test-template - # with: - # runner: ${{ runner.name }} - # script: ${{ matrix.script }} - # timeout: ${{ matrix.timeout || 10 }} - # is_unit_test: "true" - # image: dfm - # cpu-only: ${{ matrix.cpu-only || false }} - # has-azure-credentials: "true" - # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + cicd-unit-tests: + strategy: + fail-fast: false + matrix: + include: + - script: L0_Unit_Tests_GPU + runner: self-hosted-nemo + timeout: 30 + - script: L0_Unit_Tests_CPU + runner: linux-amd64-cpu16 + cpu-only: true + needs: [cicd-container-build] + runs-on: ${{ matrix.runner }} + name: ${{ matrix.script }} + environment: nemo-ci + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + - name: main + uses: ./.github/actions/test-template + with: + runner: ${{ runner.name }} + script: ${{ matrix.script }} + timeout: ${{ matrix.timeout || 10 }} + is_unit_test: "true" + image: dfm + cpu-only: ${{ matrix.cpu-only || false }} + has-azure-credentials: "true" + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - # - script: L2_Functional_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 + - script: L2_Functional_Tests_GPU + runner: self-hosted-nemo + timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-container-build] + needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - # - cicd-unit-tests + - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest From 1b3184affe5b5e0ab02e8252e53f17a5986bf32f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:15:43 -0600 Subject: [PATCH 07/17] Move uv run group call to L2_Mcore_Mock_Tests_GPU Signed-off-by: Charlie Truong --- tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 2 +- tests/functional_tests/test_mcore_wan_pretrain.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index 2e99db05..b8d237a1 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 0638f615..312a299f 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -43,10 +43,6 @@ def test_wan_pretrain_mock(self, tmp_path): # Build the command for the mock run cmd = [ - "uv", - "run", - "--group", - "megatron-bridge", "python", "-m", "torch.distributed.run", From 3a64d34a28b3787933d675e4a51c261d193e7c7d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 12:17:33 -0600 Subject: [PATCH 08/17] Set test back to 5 minute timeout Signed-off-by: Charlie Truong --- tests/functional_tests/test_mcore_wan_pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 312a299f..780bb253 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -85,7 +85,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=3000, # 5 minute timeout + timeout=300, # 5 minute timeout check=True, ) From afcae6cef26b08f0ab2f1182ee3458733472643e Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 12:09:16 -0800 Subject: [PATCH 09/17] Megatron fixes (#49) * Enhance DiT and Wan layer specifications - Updated `get_query_key_value_tensors` method in `dit_attention.py` to include an `output_gate` parameter and set `split_qkv` to default to `True`. - Modified `WanLayerWithAdaLN` class in `wan_layer_spec.py` to add `rotary_pos_cos_sin` parameter for improved positional encoding handling. * Implement ProcessGroupCollection initialization in DiT and Wan models - Added initialization of `pg_collection` in both `DiTCrossAttentionModel` and `WanModel` to ensure proper handling of process groups. - This change checks if `pg_collection` exists and is not None before assigning it, enhancing the robustness of the models. * Update CONTRIBUTING.md to include detailed setup instructions for development environment and Docker container usage. Added sections for building and running the container, as well as setting the PYTHONPATH for DFM. * Refactor import statements in dit_model.py to streamline dependencies. Removed redundant import of ProcessGroupCollection, enhancing code clarity and maintainability. * Refactor code style in DiT and Wan models - Updated string quotes in `dit_model.py` and `wan_model.py` for consistency, changing from single to double quotes. - Reformatted the `get_query_key_value_tensors` method call in `dit_attention.py` for improved readability by breaking it into multiple lines. * Revert M4 changes * Ruff * Ruff * Lint --------- Co-authored-by: Abhinav Garg --- CONTRIBUTING.md | 28 +++++++++++++++++++ .../megatron/model/common/dit_attention.py | 8 ++++-- dfm/src/megatron/model/dit/dit_model.py | 1 - dfm/src/megatron/model/wan/wan_layer_spec.py | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68ab66d4..aed9cf99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,32 @@ # Contributing To NeMo DFM +## 🛠️ Setting Up Your Environment + +Use the instructions below to setup a dev environment and a dev container + +### Building a container +```bash +# We recommend you to get the latest commits for Megatron-Bridge and Autmodel +# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands +git submodule update --init --recursive --remote # Get all the 3rd party submodules +cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong +# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty +git checkout +cd ../../../../ +docker build -f docker/Dockerfile.ci -t dfm:latest . +``` + +### Run the container +```bash +docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash +``` + +### inside the container +```bash +# Add DFM to PYTHONPATH +export PYTHONPATH=$PYTHONPATH:/opt/DFM + +# Run a Mock Run: +``` ## Signing Your Work diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py index 321e9b08..acf39d47 100644 --- a/dfm/src/megatron/model/common/dit_attention.py +++ b/dfm/src/megatron/model/common/dit_attention.py @@ -100,7 +100,7 @@ def __init__( else: self.k_layernorm = None - def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True): """ Derives `query`, `key` and `value` tensors from `hidden_states`. """ @@ -251,13 +251,15 @@ def __init__( is_expert=False, ) - def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True): """ Derives `query` tensor from `hidden_states`, and `key`/`value` tensors from `key_value_states`. """ - query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states) + query, key, value = super().get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) # gather query and key heads across TP ranks if self.layernorm_across_heads is True if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1: diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py index e3ae8a29..38cb8422 100644 --- a/dfm/src/megatron/model/dit/dit_model.py +++ b/dfm/src/megatron/model/dit/dit_model.py @@ -105,7 +105,6 @@ def __init__( super(DiTCrossAttentionModel, self).__init__(config=config) self.config: TransformerConfig = config - self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() self.pre_process = pre_process self.post_process = post_process diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py index 2b355930..a0d6354e 100644 --- a/dfm/src/megatron/model/wan/wan_layer_spec.py +++ b/dfm/src/megatron/model/wan/wan_layer_spec.py @@ -162,6 +162,7 @@ def forward( packed_seq_params=None, sequence_len_offset=None, inference_context=None, + rotary_pos_cos_sin=None, ): # the timestep embedding is stored in attention_mask argument timestep_emb = attention_mask From fdb911f729d2870e96266e34b7592819140ff2e7 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 12:54:43 -0800 Subject: [PATCH 10/17] Revert "Revert GHA changes" This reverts commit d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3. --- .github/workflows/cicd-main.yml | 76 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 81af9136..989130cb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -51,52 +51,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: self-hosted-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: self-hosted-nemo - timeout: 30 + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -121,7 +121,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest From 60f4046354da552ce8d072cd39ceb6c5c9bf72b0 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 13:23:59 -0800 Subject: [PATCH 11/17] tempfortest: timeout setting Signed-off-by: Pablo Garay --- tests/functional_tests/test_mcore_wan_pretrain.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 780bb253..1d8122ae 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path): "optimizer.lr=5e-6", "optimizer.min_lr=5e-6", "train.eval_iters=0", + "train.max_steps=10", "scheduler.lr_decay_style=constant", "scheduler.lr_warmup_iters=0", "model.seq_length=2048", @@ -85,7 +86,7 @@ def test_wan_pretrain_mock(self, tmp_path): cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=1800, # 30 minute timeout check=True, ) @@ -102,6 +103,6 @@ def test_wan_pretrain_mock(self, tmp_path): ) except subprocess.TimeoutExpired: - pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") + pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") From 9e886923a96c2c3a9eadeef5aca5ad74e628ab7d Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 13:38:57 -0800 Subject: [PATCH 12/17] workflow dispatch Signed-off-by: Pablo Garay --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 989130cb..fc8b2c25 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -13,6 +13,7 @@ # limitations under the License. name: CICD NeMo on: + workflow_dispatch: schedule: - cron: 0 0 * * * push: From b0f4058d6ea5f7a80fec228e8c314508b791bece Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 14:22:29 -0800 Subject: [PATCH 13/17] update Signed-off-by: Pablo Garay --- tests/functional_tests/test_mcore_wan_pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 1d8122ae..f9dd95ba 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -67,7 +67,6 @@ def test_wan_pretrain_mock(self, tmp_path): "optimizer.lr=5e-6", "optimizer.min_lr=5e-6", "train.eval_iters=0", - "train.max_steps=10", "scheduler.lr_decay_style=constant", "scheduler.lr_warmup_iters=0", "model.seq_length=2048", From 08a2f448a4b481edf183e36da9a4f01d5fde67f3 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 16:40:20 -0800 Subject: [PATCH 14/17] add logging Signed-off-by: Pablo Garay --- tests/functional_tests/test_mcore_wan_pretrain.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index f9dd95ba..669e425f 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -81,27 +81,17 @@ def test_wan_pretrain_mock(self, tmp_path): # Run the command with a timeout try: + # Stream output in real-time instead of capturing it result = subprocess.run( cmd, - capture_output=True, - text=True, timeout=1800, # 30 minute timeout check=True, ) - # Print output for debugging if needed - print("STDOUT:", result.stdout) - print("STDERR:", result.stderr) - # Basic verification that the run completed assert result.returncode == 0, f"Command failed with return code {result.returncode}" - # Check for common success indicators in output - assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( - "Expected to see iteration progress in output" - ) - except subprocess.TimeoutExpired: pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: - pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") + pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}") From a5d4e44e85e631f41b6656c8c927a406db05669c Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sun, 16 Nov 2025 02:50:13 +0000 Subject: [PATCH 15/17] Update test configuration for Mcore WAN pretraining - Increased the number of processes per node from 1 to 2 for distributed training. - Set the number of training iterations to 10 to enhance the training process. --- tests/functional_tests/test_mcore_wan_pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 669e425f..99645e31 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -46,7 +46,7 @@ def test_wan_pretrain_mock(self, tmp_path): "python", "-m", "torch.distributed.run", - "--nproc_per_node=1", + "--nproc_per_node=2", "examples/megatron/recipes/wan/pretrain_wan.py", "--training-mode", "pretrain", @@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path): "optimizer.lr=5e-6", "optimizer.min_lr=5e-6", "train.eval_iters=0", + "train.train_iters=10", "scheduler.lr_decay_style=constant", "scheduler.lr_warmup_iters=0", "model.seq_length=2048", From a209623e85085cfa683eb685cf5aa806fb849684 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sun, 16 Nov 2025 02:52:07 +0000 Subject: [PATCH 16/17] More changes --- tests/functional_tests/test_mcore_wan_pretrain.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 99645e31..0be8d812 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -85,10 +85,17 @@ def test_wan_pretrain_mock(self, tmp_path): # Stream output in real-time instead of capturing it result = subprocess.run( cmd, + capture_output=True, + text=True, timeout=1800, # 30 minute timeout check=True, ) + + # Print output for debugging if needed + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + # Basic verification that the run completed assert result.returncode == 0, f"Command failed with return code {result.returncode}" From f2a61c13ce3996f7557bf8dbd70e675d94799e79 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sun, 16 Nov 2025 02:53:16 +0000 Subject: [PATCH 17/17] Lint --- tests/functional_tests/test_mcore_wan_pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 0be8d812..0c9879d9 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -91,7 +91,6 @@ def test_wan_pretrain_mock(self, tmp_path): check=True, ) - # Print output for debugging if needed print("STDOUT:", result.stdout) print("STDERR:", result.stderr)