Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ runs:
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=1 \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--env RUN_ID=${{ github.run_id }} \
Expand Down
80 changes: 41 additions & 39 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
name: CICD NeMo
on:
workflow_dispatch:
schedule:
- cron: 0 0 * * *
push:
Expand Down Expand Up @@ -45,57 +46,58 @@ jobs:
with:
image-name: dfm
dockerfile: docker/Dockerfile.ci
runner: self-hosted-nemo
secrets:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-unit-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L0_Unit_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
timeout: 30
- script: L0_Unit_Tests_CPU
runner: linux-amd64-cpu16
cpu-only: true
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.script }}
environment: nemo-ci
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
- name: main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
timeout: ${{ matrix.timeout || 10 }}
is_unit_test: "true"
image: dfm
cpu-only: ${{ matrix.cpu-only || false }}
has-azure-credentials: "true"
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# cicd-unit-tests:
# strategy:
# fail-fast: false
# matrix:
# include:
# - script: L0_Unit_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
# - script: L0_Unit_Tests_CPU
# runner: linux-amd64-cpu16
# cpu-only: true
# needs: [cicd-container-build]
# runs-on: ${{ matrix.runner }}
# name: ${{ matrix.script }}
# environment: nemo-ci
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# with:
# submodules: recursive
# - name: main
# uses: ./.github/actions/test-template
# with:
# runner: ${{ runner.name }}
# script: ${{ matrix.script }}
# timeout: ${{ matrix.timeout || 10 }}
# is_unit_test: "true"
# image: dfm
# cpu-only: ${{ matrix.cpu-only || false }}
# has-azure-credentials: "true"
# azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
# azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
# azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

cicd-e2e-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L2_Functional_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
timeout: 30
# - script: L2_Functional_Tests_GPU
# runner: self-hosted-nemo
# timeout: 30
- script: L2_Mcore_Mock_Tests_GPU
runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
runner: self-hosted-nemo
timeout: 30
needs: [cicd-unit-tests]
needs: [cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
environment: nemo-ci
Expand All @@ -120,7 +122,7 @@ jobs:
Nemo_CICD_Test:
needs:
- cicd-container-build
- cicd-unit-tests
# - cicd-unit-tests
- cicd-e2e-tests
if: always()
runs-on: ubuntu-latest
Expand Down
28 changes: 28 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,32 @@
# Contributing To NeMo DFM
## 🛠️ Setting Up Your Environment

Use the instructions below to setup a dev environment and a dev container

### Building a container
```bash
# We recommend you to get the latest commits for Megatron-Bridge and Autmodel
# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands
git submodule update --init --recursive --remote # Get all the 3rd party submodules
cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
git checkout <commit_hash>
cd ../../../../
docker build -f docker/Dockerfile.ci -t dfm:latest .
```

### Run the container
```bash
docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash
```

### inside the container
```bash
# Add DFM to PYTHONPATH
export PYTHONPATH=$PYTHONPATH:/opt/DFM

# Run a Mock Run:
```

## Signing Your Work

Expand Down
8 changes: 5 additions & 3 deletions dfm/src/megatron/model/common/dit_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
else:
self.k_layernorm = None

def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
"""
Derives `query`, `key` and `value` tensors from `hidden_states`.
"""
Expand Down Expand Up @@ -251,13 +251,15 @@ def __init__(
is_expert=False,
)

def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
"""
Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
from `key_value_states`.
"""

query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states)
query, key, value = super().get_query_key_value_tensors(
hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv
)

# gather query and key heads across TP ranks if self.layernorm_across_heads is True
if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1:
Expand Down
1 change: 0 additions & 1 deletion dfm/src/megatron/model/dit/dit_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def __init__(
super(DiTCrossAttentionModel, self).__init__(config=config)

self.config: TransformerConfig = config

self.transformer_decoder_layer_spec = transformer_decoder_layer_spec()
self.pre_process = pre_process
self.post_process = post_process
Expand Down
1 change: 1 addition & 0 deletions dfm/src/megatron/model/wan/wan_layer_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def forward(
packed_seq_params=None,
sequence_len_offset=None,
inference_context=None,
rotary_pos_cos_sin=None,
):
# the timestep embedding is stored in attention_mask argument
timestep_emb = attention_mask
Expand Down
2 changes: 1 addition & 1 deletion tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
15 changes: 6 additions & 9 deletions tests/functional_tests/test_mcore_wan_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_wan_pretrain_mock(self, tmp_path):
"python",
"-m",
"torch.distributed.run",
"--nproc_per_node=1",
"--nproc_per_node=2",
"examples/megatron/recipes/wan/pretrain_wan.py",
"--training-mode",
"pretrain",
Expand All @@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path):
"optimizer.lr=5e-6",
"optimizer.min_lr=5e-6",
"train.eval_iters=0",
"train.train_iters=10",
"scheduler.lr_decay_style=constant",
"scheduler.lr_warmup_iters=0",
"model.seq_length=2048",
Expand All @@ -81,11 +82,12 @@ def test_wan_pretrain_mock(self, tmp_path):

# Run the command with a timeout
try:
# Stream output in real-time instead of capturing it
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
timeout=1800, # 30 minute timeout
check=True,
)

Expand All @@ -96,12 +98,7 @@ def test_wan_pretrain_mock(self, tmp_path):
# Basic verification that the run completed
assert result.returncode == 0, f"Command failed with return code {result.returncode}"

# Check for common success indicators in output
assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), (
"Expected to see iteration progress in output"
)

except subprocess.TimeoutExpired:
pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
except subprocess.CalledProcessError as e:
pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}")
Loading